freedreno: move ir3 to common location
authorRob Clark <robdclark@gmail.com>
Sat, 10 Nov 2018 17:05:59 +0000 (12:05 -0500)
committerRob Clark <robdclark@gmail.com>
Tue, 27 Nov 2018 20:44:02 +0000 (15:44 -0500)
Move (most of) the ir3 compiler to src/freedreno/ir3 so that it can be
reused by a future vulkan driver.  The parts that are gallium-specific
have been refactored out and remain in the gallium driver.

Do the move now, so that it lands before further refactoring to support
a6xx-specific instructions.

NOTE: this also removes the ir3_cmdline compiler tool from the autotools
build, since that was easier than fixing it and I normally use the meson
build.  Waiting patiently for the day that we can remove *everything*
from the autotools build.

Signed-off-by: Rob Clark <robdclark@gmail.com>
64 files changed:
src/freedreno/Makefile.am
src/freedreno/Makefile.sources
src/freedreno/ir3/disasm-a3xx.c [new file with mode: 0644]
src/freedreno/ir3/instr-a3xx.h [new file with mode: 0644]
src/freedreno/ir3/ir3.c [new file with mode: 0644]
src/freedreno/ir3/ir3.h [new file with mode: 0644]
src/freedreno/ir3/ir3_compiler.c [new file with mode: 0644]
src/freedreno/ir3/ir3_compiler.h [new file with mode: 0644]
src/freedreno/ir3/ir3_compiler_nir.c [new file with mode: 0644]
src/freedreno/ir3/ir3_cp.c [new file with mode: 0644]
src/freedreno/ir3/ir3_depth.c [new file with mode: 0644]
src/freedreno/ir3/ir3_group.c [new file with mode: 0644]
src/freedreno/ir3/ir3_legalize.c [new file with mode: 0644]
src/freedreno/ir3/ir3_nir.c [new file with mode: 0644]
src/freedreno/ir3/ir3_nir.h [new file with mode: 0644]
src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c [new file with mode: 0644]
src/freedreno/ir3/ir3_nir_trig.py [new file with mode: 0644]
src/freedreno/ir3/ir3_print.c [new file with mode: 0644]
src/freedreno/ir3/ir3_ra.c [new file with mode: 0644]
src/freedreno/ir3/ir3_sched.c [new file with mode: 0644]
src/freedreno/ir3/ir3_shader.c [new file with mode: 0644]
src/freedreno/ir3/ir3_shader.h [new file with mode: 0644]
src/freedreno/ir3/meson.build [new file with mode: 0644]
src/freedreno/meson.build
src/gallium/drivers/freedreno/Automake.inc
src/gallium/drivers/freedreno/Makefile.am
src/gallium/drivers/freedreno/Makefile.sources
src/gallium/drivers/freedreno/a3xx/fd3_context.h
src/gallium/drivers/freedreno/a3xx/fd3_program.h
src/gallium/drivers/freedreno/a3xx/fd3_screen.c
src/gallium/drivers/freedreno/a4xx/fd4_context.h
src/gallium/drivers/freedreno/a4xx/fd4_program.h
src/gallium/drivers/freedreno/a4xx/fd4_screen.c
src/gallium/drivers/freedreno/a5xx/fd5_context.h
src/gallium/drivers/freedreno/a5xx/fd5_program.h
src/gallium/drivers/freedreno/a5xx/fd5_screen.c
src/gallium/drivers/freedreno/a6xx/fd6_context.h
src/gallium/drivers/freedreno/a6xx/fd6_program.h
src/gallium/drivers/freedreno/a6xx/fd6_screen.c
src/gallium/drivers/freedreno/ir3/disasm-a3xx.c [deleted file]
src/gallium/drivers/freedreno/ir3/instr-a3xx.h [deleted file]
src/gallium/drivers/freedreno/ir3/ir3.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3.h [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_cache.h
src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
src/gallium/drivers/freedreno/ir3/ir3_compiler.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_compiler.h [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_cp.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_depth.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_gallium.c
src/gallium/drivers/freedreno/ir3/ir3_gallium.h
src/gallium/drivers/freedreno/ir3/ir3_group.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_legalize.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_nir.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_nir.h [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_print.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_ra.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_sched.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_shader.c [deleted file]
src/gallium/drivers/freedreno/ir3/ir3_shader.h [deleted file]
src/gallium/drivers/freedreno/meson.build

index 9ddc3c0ad3593e51677afa7b3ded6d0d7854e96d..8f027e34f8ae1e0739d9dbd65a84e0db30db0ea1 100644 (file)
@@ -45,7 +45,8 @@ TESTS =
 BUILT_SOURCES =
 CLEANFILES =
 EXTRA_DIST = \
-       drm/meson.build
+       drm/meson.build \
+       ir3/meson.build
 
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 PYTHON_GEN = $(AM_V_GEN)$(PYTHON) $(PYTHON_FLAGS)
@@ -57,3 +58,19 @@ noinst_LTLIBRARIES += libfreedreno_drm.la
 libfreedreno_drm_la_SOURCES = $(drm_SOURCES)
 libfreedreno_drm_la_CFLAGS = $(VALGRIND_CFLAGS) $(LIBDRM_CFLAGS)
 
+noinst_LTLIBRARIES += libfreedreno_ir3.la
+
+libfreedreno_ir3_la_SOURCES = $(ir3_SOURCES) $(ir3_GENERATED_FILES)
+libfreedreno_ir3_la_CFLAGS = \
+       -I$(top_srcdir)/src/freedreno/ir3 \
+       -I$(top_builddir)/src/compiler/nir \
+       -I$(top_srcdir)/src/compiler/nir
+libfreedreno_ir3_la_LIBADD = \
+       $(top_builddir)/src/compiler/nir/libnir.la \
+       $(top_builddir)/src/util/libmesautil.la
+
+MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
+ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
+       $(MKDIR_GEN)
+       $(AM_V_GEN) $(PYTHON) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py -p $(top_srcdir)/src/compiler/nir > $@ || ($(RM) $@; false)
+
index 06a1a99b9e2218489d79f5932389ba46505f0614..1df5e6250b5c3cd86274bb6e7164901ef0773045 100644 (file)
@@ -15,3 +15,27 @@ drm_SOURCES := \
        drm/msm_drm.h \
        drm/msm_ringbuffer.c
 
+ir3_SOURCES := \
+       ir3/disasm-a3xx.c \
+       ir3/instr-a3xx.h \
+       ir3/ir3.c \
+       ir3/ir3_compiler.c \
+       ir3/ir3_compiler.h \
+       ir3/ir3_compiler_nir.c \
+       ir3/ir3_cp.c \
+       ir3/ir3_depth.c \
+       ir3/ir3_group.c \
+       ir3/ir3.h \
+       ir3/ir3_legalize.c \
+       ir3/ir3_nir.c \
+       ir3/ir3_nir.h \
+       ir3/ir3_nir_lower_tg4_to_tex.c \
+       ir3/ir3_print.c \
+       ir3/ir3_ra.c \
+       ir3/ir3_sched.c \
+       ir3/ir3_shader.c \
+       ir3/ir3_shader.h
+
+ir3_GENERATED_FILES := \
+       ir3/ir3_nir_trig.c
+
diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
new file mode 100644 (file)
index 0000000..4cf45ce
--- /dev/null
@@ -0,0 +1,1038 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include <util/u_debug.h>
+
+#include "instr-a3xx.h"
+
+/* bitmask of debug flags */
+enum debug_t {
+       PRINT_RAW      = 0x1,    /* dump raw hexdump */
+       PRINT_VERBOSE  = 0x2,
+};
+
+static enum debug_t debug;
+
+#define printf debug_printf
+
+static const char *levels[] = {
+               "",
+               "\t",
+               "\t\t",
+               "\t\t\t",
+               "\t\t\t\t",
+               "\t\t\t\t\t",
+               "\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t",
+               "\t\t\t\t\t\t\t\t\t",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+               "x",
+};
+
+static const char *component = "xyzw";
+
+static const char *type[] = {
+               [TYPE_F16] = "f16",
+               [TYPE_F32] = "f32",
+               [TYPE_U16] = "u16",
+               [TYPE_U32] = "u32",
+               [TYPE_S16] = "s16",
+               [TYPE_S32] = "s32",
+               [TYPE_U8]  = "u8",
+               [TYPE_S8]  = "s8",
+};
+
+struct disasm_ctx {
+       FILE *out;
+       int level;
+
+       /* current instruction repeat flag: */
+       unsigned repeat;
+};
+
+static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
+               bool c, bool im, bool neg, bool abs, bool addr_rel)
+{
+       const char type = c ? 'c' : 'r';
+
+       // XXX I prefer - and || for neg/abs, but preserving format used
+       // by libllvm-a3xx for easy diffing..
+
+       if (abs && neg)
+               fprintf(ctx->out, "(absneg)");
+       else if (neg)
+               fprintf(ctx->out, "(neg)");
+       else if (abs)
+               fprintf(ctx->out, "(abs)");
+
+       if (r)
+               fprintf(ctx->out, "(r)");
+
+       if (im) {
+               fprintf(ctx->out, "%d", reg.iim_val);
+       } else if (addr_rel) {
+               /* I would just use %+d but trying to make it diff'able with
+                * libllvm-a3xx...
+                */
+               if (reg.iim_val < 0)
+                       fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
+               else if (reg.iim_val > 0)
+                       fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
+               else
+                       fprintf(ctx->out, "%s%c<a0.x>", full ? "" : "h", type);
+       } else if ((reg.num == REG_A0) && !c) {
+               fprintf(ctx->out, "a0.%c", component[reg.comp]);
+       } else if ((reg.num == REG_P0) && !c) {
+               fprintf(ctx->out, "p0.%c", component[reg.comp]);
+       } else {
+               fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]);
+       }
+}
+
+
+static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
+{
+       print_reg(ctx, reg, full, false, false, false, false, false, addr_rel);
+}
+
+static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
+               bool c, bool im, bool neg, bool abs, bool addr_rel)
+{
+       print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel);
+}
+
+/* TODO switch to using reginfo struct everywhere, since more readable
+ * than passing a bunch of bools to print_reg_src
+ */
+
+struct reginfo {
+       reg_t reg;
+       bool full;
+       bool r;
+       bool c;
+       bool im;
+       bool neg;
+       bool abs;
+       bool addr_rel;
+};
+
+static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
+{
+       print_reg_src(ctx, info->reg, info->full, info->r, info->c, info->im,
+                       info->neg, info->abs, info->addr_rel);
+}
+
+//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
+//{
+//     print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
+//}
+
+static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat0_t *cat0 = &instr->cat0;
+
+       switch (cat0->opc) {
+       case OPC_KILL:
+               fprintf(ctx->out, " %sp0.%c", cat0->inv ? "!" : "",
+                               component[cat0->comp]);
+               break;
+       case OPC_BR:
+               fprintf(ctx->out, " %sp0.%c, #%d", cat0->inv ? "!" : "",
+                               component[cat0->comp], cat0->a3xx.immed);
+               break;
+       case OPC_JUMP:
+       case OPC_CALL:
+               fprintf(ctx->out, " #%d", cat0->a3xx.immed);
+               break;
+       }
+
+       if ((debug & PRINT_VERBOSE) && (cat0->dummy2|cat0->dummy3|cat0->dummy4))
+               fprintf(ctx->out, "\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4);
+}
+
+static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat1_t *cat1 = &instr->cat1;
+
+       if (cat1->ul)
+               fprintf(ctx->out, "(ul)");
+
+       if (cat1->src_type == cat1->dst_type) {
+               if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
+                       /* special case (mnemonic?): */
+                       fprintf(ctx->out, "mova");
+               } else {
+                       fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+               }
+       } else {
+               fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+       }
+
+       fprintf(ctx->out, " ");
+
+       if (cat1->even)
+               fprintf(ctx->out, "(even)");
+
+       if (cat1->pos_inf)
+               fprintf(ctx->out, "(pos_infinity)");
+
+       print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
+                       cat1->dst_rel);
+
+       fprintf(ctx->out, ", ");
+
+       /* ugg, have to special case this.. vs print_reg().. */
+       if (cat1->src_im) {
+               if (type_float(cat1->src_type))
+                       fprintf(ctx->out, "(%f)", cat1->fim_val);
+               else if (type_uint(cat1->src_type))
+                       fprintf(ctx->out, "0x%08x", cat1->uim_val);
+               else
+                       fprintf(ctx->out, "%d", cat1->iim_val);
+       } else if (cat1->src_rel && !cat1->src_c) {
+               /* I would just use %+d but trying to make it diff'able with
+                * libllvm-a3xx...
+                */
+               char type = cat1->src_rel_c ? 'c' : 'r';
+               if (cat1->off < 0)
+                       fprintf(ctx->out, "%c<a0.x - %d>", type, -cat1->off);
+               else if (cat1->off > 0)
+                       fprintf(ctx->out, "%c<a0.x + %d>", type, cat1->off);
+               else
+                       fprintf(ctx->out, "%c<a0.x>", type);
+       } else {
+               print_reg_src(ctx, (reg_t)(cat1->src), type_size(cat1->src_type) == 32,
+                               cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
+       }
+
+       if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
+               fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0);
+}
+
+static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat2_t *cat2 = &instr->cat2;
+       static const char *cond[] = {
+                       "lt",
+                       "le",
+                       "gt",
+                       "ge",
+                       "eq",
+                       "ne",
+                       "?6?",
+       };
+
+       switch (_OPC(2, cat2->opc)) {
+       case OPC_CMPS_F:
+       case OPC_CMPS_U:
+       case OPC_CMPS_S:
+       case OPC_CMPV_F:
+       case OPC_CMPV_U:
+       case OPC_CMPV_S:
+               fprintf(ctx->out, ".%s", cond[cat2->cond]);
+               break;
+       }
+
+       fprintf(ctx->out, " ");
+       if (cat2->ei)
+               fprintf(ctx->out, "(ei)");
+       print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
+       fprintf(ctx->out, ", ");
+
+       if (cat2->c1.src1_c) {
+               print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
+                               cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
+                               cat2->src1_abs, false);
+       } else if (cat2->rel1.src1_rel) {
+               print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
+                               cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
+                               cat2->src1_abs, cat2->rel1.src1_rel);
+       } else {
+               print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r,
+                               false, cat2->src1_im, cat2->src1_neg,
+                               cat2->src1_abs, false);
+       }
+
+       switch (_OPC(2, cat2->opc)) {
+       case OPC_ABSNEG_F:
+       case OPC_ABSNEG_S:
+       case OPC_CLZ_B:
+       case OPC_CLZ_S:
+       case OPC_SIGN_F:
+       case OPC_FLOOR_F:
+       case OPC_CEIL_F:
+       case OPC_RNDNE_F:
+       case OPC_RNDAZ_F:
+       case OPC_TRUNC_F:
+       case OPC_NOT_B:
+       case OPC_BFREV_B:
+       case OPC_SETRM:
+       case OPC_CBITS_B:
+               /* these only have one src reg */
+               break;
+       default:
+               fprintf(ctx->out, ", ");
+               if (cat2->c2.src2_c) {
+                       print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
+                                       cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
+                                       cat2->src2_abs, false);
+               } else if (cat2->rel2.src2_rel) {
+                       print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
+                                       cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
+                                       cat2->src2_abs, cat2->rel2.src2_rel);
+               } else {
+                       print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r,
+                                       false, cat2->src2_im, cat2->src2_neg,
+                                       cat2->src2_abs, false);
+               }
+               break;
+       }
+}
+
+static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat3_t *cat3 = &instr->cat3;
+       bool full = instr_cat3_full(cat3);
+
+       fprintf(ctx->out, " ");
+       print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false);
+       fprintf(ctx->out, ", ");
+       if (cat3->c1.src1_c) {
+               print_reg_src(ctx, (reg_t)(cat3->c1.src1), full,
+                               cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg,
+                               false, false);
+       } else if (cat3->rel1.src1_rel) {
+               print_reg_src(ctx, (reg_t)(cat3->rel1.src1), full,
+                               cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg,
+                               false, cat3->rel1.src1_rel);
+       } else {
+               print_reg_src(ctx, (reg_t)(cat3->src1), full,
+                               cat3->src1_r, false, false, cat3->src1_neg,
+                               false, false);
+       }
+       fprintf(ctx->out, ", ");
+       print_reg_src(ctx, (reg_t)cat3->src2, full,
+                       cat3->src2_r, cat3->src2_c, false, cat3->src2_neg,
+                       false, false);
+       fprintf(ctx->out, ", ");
+       if (cat3->c2.src3_c) {
+               print_reg_src(ctx, (reg_t)(cat3->c2.src3), full,
+                               cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg,
+                               false, false);
+       } else if (cat3->rel2.src3_rel) {
+               print_reg_src(ctx, (reg_t)(cat3->rel2.src3), full,
+                               cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg,
+                               false, cat3->rel2.src3_rel);
+       } else {
+               print_reg_src(ctx, (reg_t)(cat3->src3), full,
+                               cat3->src3_r, false, false, cat3->src3_neg,
+                               false, false);
+       }
+}
+
+static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat4_t *cat4 = &instr->cat4;
+
+       fprintf(ctx->out, " ");
+       print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
+       fprintf(ctx->out, ", ");
+
+       if (cat4->c.src_c) {
+               print_reg_src(ctx, (reg_t)(cat4->c.src), cat4->full,
+                               cat4->src_r, cat4->c.src_c, cat4->src_im,
+                               cat4->src_neg, cat4->src_abs, false);
+       } else if (cat4->rel.src_rel) {
+               print_reg_src(ctx, (reg_t)(cat4->rel.src), cat4->full,
+                               cat4->src_r, cat4->rel.src_c, cat4->src_im,
+                               cat4->src_neg, cat4->src_abs, cat4->rel.src_rel);
+       } else {
+               print_reg_src(ctx, (reg_t)(cat4->src), cat4->full,
+                               cat4->src_r, false, cat4->src_im,
+                               cat4->src_neg, cat4->src_abs, false);
+       }
+
+       if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
+               fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
+}
+
+static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
+{
+       static const struct {
+               bool src1, src2, samp, tex;
+       } info[0x1f] = {
+                       [opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
+                       [opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
+                       [opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
+                       [opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
+                       [opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
+                       [opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
+                       [opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
+                       [opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
+                       [opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
+                       [opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
+                       [opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
+                       [opc_op(OPC_DSX)]      = { true,  false, false, false, },
+                       [opc_op(OPC_DSY)]      = { true,  false, false, false, },
+                       [opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
+                       [opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
+                       [opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
+                       [opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
+                       [opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
+                       [opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
+                       [opc_op(OPC_RGETPOS)]  = { false, false, false, false, },
+                       [opc_op(OPC_RGETINFO)] = { false, false, false, false, },
+       };
+       instr_cat5_t *cat5 = &instr->cat5;
+       int i;
+
+       if (cat5->is_3d)   fprintf(ctx->out, ".3d");
+       if (cat5->is_a)    fprintf(ctx->out, ".a");
+       if (cat5->is_o)    fprintf(ctx->out, ".o");
+       if (cat5->is_p)    fprintf(ctx->out, ".p");
+       if (cat5->is_s)    fprintf(ctx->out, ".s");
+       if (cat5->is_s2en) fprintf(ctx->out, ".s2en");
+
+       fprintf(ctx->out, " ");
+
+       switch (_OPC(5, cat5->opc)) {
+       case OPC_DSXPP_1:
+       case OPC_DSYPP_1:
+               break;
+       default:
+               fprintf(ctx->out, "(%s)", type[cat5->type]);
+               break;
+       }
+
+       fprintf(ctx->out, "(");
+       for (i = 0; i < 4; i++)
+               if (cat5->wrmask & (1 << i))
+                       fprintf(ctx->out, "%c", "xyzw"[i]);
+       fprintf(ctx->out, ")");
+
+       print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
+
+       if (info[cat5->opc].src1) {
+               fprintf(ctx->out, ", ");
+               print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false,
+                               false, false, false);
+       }
+
+       if (cat5->is_s2en) {
+               fprintf(ctx->out, ", ");
+               print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
+                               false, false, false);
+               fprintf(ctx->out, ", ");
+               print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false,
+                               false, false, false);
+       } else {
+               if (cat5->is_o || info[cat5->opc].src2) {
+                       fprintf(ctx->out, ", ");
+                       print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full,
+                                       false, false, false, false, false, false);
+               }
+               if (info[cat5->opc].samp)
+                       fprintf(ctx->out, ", s#%d", cat5->norm.samp);
+               if (info[cat5->opc].tex)
+                       fprintf(ctx->out, ", t#%d", cat5->norm.tex);
+       }
+
+       if (debug & PRINT_VERBOSE) {
+               if (cat5->is_s2en) {
+                       if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2))
+                               fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
+               } else {
+                       if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2))
+                               fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
+               }
+       }
+}
+
+static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat6_t *cat6 = &instr->cat6;
+       char sd = 0, ss = 0;  /* dst/src address space */
+       bool nodst = false;
+       struct reginfo dst, src1, src2;
+       int src1off = 0, dstoff = 0;
+
+       memset(&dst, 0, sizeof(dst));
+       memset(&src1, 0, sizeof(src1));
+       memset(&src2, 0, sizeof(src2));
+
+       switch (_OPC(6, cat6->opc)) {
+       case OPC_RESINFO:
+       case OPC_RESFMT:
+               dst.full  = type_size(cat6->type) == 32;
+               src1.full = type_size(cat6->type) == 32;
+               src2.full = type_size(cat6->type) == 32;
+               break;
+       case OPC_L2G:
+       case OPC_G2L:
+               dst.full = true;
+               src1.full = true;
+               src2.full = true;
+               break;
+       case OPC_STG:
+       case OPC_STL:
+       case OPC_STP:
+       case OPC_STI:
+       case OPC_STLW:
+       case OPC_STIB:
+               dst.full  = true;
+               src1.full = type_size(cat6->type) == 32;
+               src2.full = type_size(cat6->type) == 32;
+               break;
+       default:
+               dst.full  = type_size(cat6->type) == 32;
+               src1.full = true;
+               src2.full = true;
+               break;
+       }
+
+       switch (_OPC(6, cat6->opc)) {
+       case OPC_PREFETCH:
+               break;
+       case OPC_RESINFO:
+               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+               break;
+       case OPC_LDGB:
+               fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+               fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+               break;
+       case OPC_STGB:
+       case OPC_STIB:
+               fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
+               fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+               fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
+               break;
+       case OPC_ATOMIC_ADD:
+       case OPC_ATOMIC_SUB:
+       case OPC_ATOMIC_XCHG:
+       case OPC_ATOMIC_INC:
+       case OPC_ATOMIC_DEC:
+       case OPC_ATOMIC_CMPXCHG:
+       case OPC_ATOMIC_MIN:
+       case OPC_ATOMIC_MAX:
+       case OPC_ATOMIC_AND:
+       case OPC_ATOMIC_OR:
+       case OPC_ATOMIC_XOR:
+               ss = cat6->g ? 'g' : 'l';
+               fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+               fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+               fprintf(ctx->out, ".%c", ss);
+               break;
+       default:
+               dst.im = cat6->g && !cat6->dst_off;
+               fprintf(ctx->out, ".%s", type[cat6->type]);
+               break;
+       }
+       fprintf(ctx->out, " ");
+
+       switch (_OPC(6, cat6->opc)) {
+       case OPC_STG:
+               sd = 'g';
+               break;
+       case OPC_STP:
+               sd = 'p';
+               break;
+       case OPC_STL:
+       case OPC_STLW:
+               sd = 'l';
+               break;
+
+       case OPC_LDG:
+       case OPC_LDC:
+               ss = 'g';
+               break;
+       case OPC_LDP:
+               ss = 'p';
+               break;
+       case OPC_LDL:
+       case OPC_LDLW:
+       case OPC_LDLV:
+               ss = 'l';
+               break;
+
+       case OPC_L2G:
+               ss = 'l';
+               sd = 'g';
+               break;
+
+       case OPC_G2L:
+               ss = 'g';
+               sd = 'l';
+               break;
+
+       case OPC_PREFETCH:
+               ss = 'g';
+               nodst = true;
+               break;
+
+       case OPC_STI:
+               dst.full = false;  // XXX or inverts??
+               break;
+       }
+
+       if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) {
+               struct reginfo src3;
+
+               memset(&src3, 0, sizeof(src3));
+
+               src1.reg = (reg_t)(cat6->stgb.src1);
+               src2.reg = (reg_t)(cat6->stgb.src2);
+               src2.im  = cat6->stgb.src2_im;
+               src3.reg = (reg_t)(cat6->stgb.src3);
+               src3.im  = cat6->stgb.src3_im;
+               src3.full = true;
+
+               fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
+               print_src(ctx, &src1);
+               fprintf(ctx->out, ", ");
+               print_src(ctx, &src2);
+               fprintf(ctx->out, ", ");
+               print_src(ctx, &src3);
+
+               if (debug & PRINT_VERBOSE)
+                       fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
+
+               return;
+       }
+
+       if (is_atomic(_OPC(6, cat6->opc))) {
+
+               src1.reg = (reg_t)(cat6->ldgb.src1);
+               src1.im  = cat6->ldgb.src1_im;
+               src2.reg = (reg_t)(cat6->ldgb.src2);
+               src2.im  = cat6->ldgb.src2_im;
+               dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+               print_src(ctx, &dst);
+               fprintf(ctx->out, ", ");
+               if (ss == 'g') {
+                       struct reginfo src3;
+                       memset(&src3, 0, sizeof(src3));
+
+                       src3.reg = (reg_t)(cat6->ldgb.src3);
+                       src3.full = true;
+
+                       /* For images, the ".typed" variant is used and src2 is
+                        * the ivecN coordinates, ie ivec2 for 2d.
+                        *
+                        * For SSBOs, the ".untyped" variant is used and src2 is
+                        * a simple dword offset..  src3 appears to be
+                        * uvec2(offset * 4, 0).  Not sure the point of that.
+                        */
+
+                       fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+                       print_src(ctx, &src1);  /* value */
+                       fprintf(ctx->out, ", ");
+                       print_src(ctx, &src2);  /* offset/coords */
+                       fprintf(ctx->out, ", ");
+                       print_src(ctx, &src3);  /* 64b byte offset.. */
+
+                       if (debug & PRINT_VERBOSE) {
+                               fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
+                                               cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+                       }
+               } else { /* ss == 'l' */
+                       fprintf(ctx->out, "l[");
+                       print_src(ctx, &src1);  /* simple byte offset */
+                       fprintf(ctx->out, "], ");
+                       print_src(ctx, &src2);  /* value */
+
+                       if (debug & PRINT_VERBOSE) {
+                               fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
+                                               cat6->ldgb.src3, cat6->ldgb.pad0,
+                                               cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+                       }
+               }
+
+               return;
+       } else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
+               dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+               print_src(ctx, &dst);
+               fprintf(ctx->out, ", ");
+               fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
+
+               return;
+       } else if (_OPC(6, cat6->opc) == OPC_LDGB) {
+
+               src1.reg = (reg_t)(cat6->ldgb.src1);
+               src1.im  = cat6->ldgb.src1_im;
+               src2.reg = (reg_t)(cat6->ldgb.src2);
+               src2.im  = cat6->ldgb.src2_im;
+               dst.reg  = (reg_t)(cat6->ldgb.dst);
+
+               print_src(ctx, &dst);
+               fprintf(ctx->out, ", ");
+               fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+               print_src(ctx, &src1);
+               fprintf(ctx->out, ", ");
+               print_src(ctx, &src2);
+
+               if (debug & PRINT_VERBOSE)
+                       fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+
+               return;
+       }
+       if (cat6->dst_off) {
+               dst.reg = (reg_t)(cat6->c.dst);
+               dstoff  = cat6->c.off;
+       } else {
+               dst.reg = (reg_t)(cat6->d.dst);
+       }
+
+       if (cat6->src_off) {
+               src1.reg = (reg_t)(cat6->a.src1);
+               src1.im  = cat6->a.src1_im;
+               src2.reg = (reg_t)(cat6->a.src2);
+               src2.im  = cat6->a.src2_im;
+               src1off  = cat6->a.off;
+       } else {
+               src1.reg = (reg_t)(cat6->b.src1);
+               src1.im  = cat6->b.src1_im;
+               src2.reg = (reg_t)(cat6->b.src2);
+               src2.im  = cat6->b.src2_im;
+       }
+
+       if (!nodst) {
+               if (sd)
+                       fprintf(ctx->out, "%c[", sd);
+               /* note: dst might actually be a src (ie. address to store to) */
+               print_src(ctx, &dst);
+               if (dstoff)
+                       fprintf(ctx->out, "%+d", dstoff);
+               if (sd)
+                       fprintf(ctx->out, "]");
+               fprintf(ctx->out, ", ");
+       }
+
+       if (ss)
+               fprintf(ctx->out, "%c[", ss);
+
+       /* can have a larger than normal immed, so hack: */
+       if (src1.im) {
+               fprintf(ctx->out, "%u", src1.reg.dummy13);
+       } else {
+               print_src(ctx, &src1);
+       }
+
+       if (src1off)
+               fprintf(ctx->out, "%+d", src1off);
+       if (ss)
+               fprintf(ctx->out, "]");
+
+       switch (_OPC(6, cat6->opc)) {
+       case OPC_RESINFO:
+       case OPC_RESFMT:
+               break;
+       default:
+               fprintf(ctx->out, ", ");
+               print_src(ctx, &src2);
+               break;
+       }
+}
+
+static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
+{
+       instr_cat7_t *cat7 = &instr->cat7;
+
+       if (cat7->g)
+               fprintf(ctx->out, ".g");
+       if (cat7->l)
+               fprintf(ctx->out, ".l");
+
+       if (_OPC(7, cat7->opc) == OPC_FENCE) {
+               if (cat7->r)
+                       fprintf(ctx->out, ".r");
+               if (cat7->w)
+                       fprintf(ctx->out, ".w");
+       }
+}
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+static const struct opc_info {
+       uint16_t cat;
+       uint16_t opc;
+       const char *name;
+       void (*print)(struct disasm_ctx *ctx, instr_t *instr);
+} opcs[1 << (3+NOPC_BITS)] = {
+#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
+       /* category 0: */
+       OPC(0, OPC_NOP,          nop),
+       OPC(0, OPC_BR,           br),
+       OPC(0, OPC_JUMP,         jump),
+       OPC(0, OPC_CALL,         call),
+       OPC(0, OPC_RET,          ret),
+       OPC(0, OPC_KILL,         kill),
+       OPC(0, OPC_END,          end),
+       OPC(0, OPC_EMIT,         emit),
+       OPC(0, OPC_CUT,          cut),
+       OPC(0, OPC_CHMASK,       chmask),
+       OPC(0, OPC_CHSH,         chsh),
+       OPC(0, OPC_FLOW_REV,     flow_rev),
+
+       /* category 1: */
+       OPC(1, OPC_MOV, ),
+
+       /* category 2: */
+       OPC(2, OPC_ADD_F,        add.f),
+       OPC(2, OPC_MIN_F,        min.f),
+       OPC(2, OPC_MAX_F,        max.f),
+       OPC(2, OPC_MUL_F,        mul.f),
+       OPC(2, OPC_SIGN_F,       sign.f),
+       OPC(2, OPC_CMPS_F,       cmps.f),
+       OPC(2, OPC_ABSNEG_F,     absneg.f),
+       OPC(2, OPC_CMPV_F,       cmpv.f),
+       OPC(2, OPC_FLOOR_F,      floor.f),
+       OPC(2, OPC_CEIL_F,       ceil.f),
+       OPC(2, OPC_RNDNE_F,      rndne.f),
+       OPC(2, OPC_RNDAZ_F,      rndaz.f),
+       OPC(2, OPC_TRUNC_F,      trunc.f),
+       OPC(2, OPC_ADD_U,        add.u),
+       OPC(2, OPC_ADD_S,        add.s),
+       OPC(2, OPC_SUB_U,        sub.u),
+       OPC(2, OPC_SUB_S,        sub.s),
+       OPC(2, OPC_CMPS_U,       cmps.u),
+       OPC(2, OPC_CMPS_S,       cmps.s),
+       OPC(2, OPC_MIN_U,        min.u),
+       OPC(2, OPC_MIN_S,        min.s),
+       OPC(2, OPC_MAX_U,        max.u),
+       OPC(2, OPC_MAX_S,        max.s),
+       OPC(2, OPC_ABSNEG_S,     absneg.s),
+       OPC(2, OPC_AND_B,        and.b),
+       OPC(2, OPC_OR_B,         or.b),
+       OPC(2, OPC_NOT_B,        not.b),
+       OPC(2, OPC_XOR_B,        xor.b),
+       OPC(2, OPC_CMPV_U,       cmpv.u),
+       OPC(2, OPC_CMPV_S,       cmpv.s),
+       OPC(2, OPC_MUL_U,        mul.u),
+       OPC(2, OPC_MUL_S,        mul.s),
+       OPC(2, OPC_MULL_U,       mull.u),
+       OPC(2, OPC_BFREV_B,      bfrev.b),
+       OPC(2, OPC_CLZ_S,        clz.s),
+       OPC(2, OPC_CLZ_B,        clz.b),
+       OPC(2, OPC_SHL_B,        shl.b),
+       OPC(2, OPC_SHR_B,        shr.b),
+       OPC(2, OPC_ASHR_B,       ashr.b),
+       OPC(2, OPC_BARY_F,       bary.f),
+       OPC(2, OPC_MGEN_B,       mgen.b),
+       OPC(2, OPC_GETBIT_B,     getbit.b),
+       OPC(2, OPC_SETRM,        setrm),
+       OPC(2, OPC_CBITS_B,      cbits.b),
+       OPC(2, OPC_SHB,          shb),
+       OPC(2, OPC_MSAD,         msad),
+
+       /* category 3: */
+       OPC(3, OPC_MAD_U16,      mad.u16),
+       OPC(3, OPC_MADSH_U16,    madsh.u16),
+       OPC(3, OPC_MAD_S16,      mad.s16),
+       OPC(3, OPC_MADSH_M16,    madsh.m16),
+       OPC(3, OPC_MAD_U24,      mad.u24),
+       OPC(3, OPC_MAD_S24,      mad.s24),
+       OPC(3, OPC_MAD_F16,      mad.f16),
+       OPC(3, OPC_MAD_F32,      mad.f32),
+       OPC(3, OPC_SEL_B16,      sel.b16),
+       OPC(3, OPC_SEL_B32,      sel.b32),
+       OPC(3, OPC_SEL_S16,      sel.s16),
+       OPC(3, OPC_SEL_S32,      sel.s32),
+       OPC(3, OPC_SEL_F16,      sel.f16),
+       OPC(3, OPC_SEL_F32,      sel.f32),
+       OPC(3, OPC_SAD_S16,      sad.s16),
+       OPC(3, OPC_SAD_S32,      sad.s32),
+
+       /* category 4: */
+       OPC(4, OPC_RCP,          rcp),
+       OPC(4, OPC_RSQ,          rsq),
+       OPC(4, OPC_LOG2,         log2),
+       OPC(4, OPC_EXP2,         exp2),
+       OPC(4, OPC_SIN,          sin),
+       OPC(4, OPC_COS,          cos),
+       OPC(4, OPC_SQRT,         sqrt),
+
+       /* category 5: */
+       OPC(5, OPC_ISAM,         isam),
+       OPC(5, OPC_ISAML,        isaml),
+       OPC(5, OPC_ISAMM,        isamm),
+       OPC(5, OPC_SAM,          sam),
+       OPC(5, OPC_SAMB,         samb),
+       OPC(5, OPC_SAML,         saml),
+       OPC(5, OPC_SAMGQ,        samgq),
+       OPC(5, OPC_GETLOD,       getlod),
+       OPC(5, OPC_CONV,         conv),
+       OPC(5, OPC_CONVM,        convm),
+       OPC(5, OPC_GETSIZE,      getsize),
+       OPC(5, OPC_GETBUF,       getbuf),
+       OPC(5, OPC_GETPOS,       getpos),
+       OPC(5, OPC_GETINFO,      getinfo),
+       OPC(5, OPC_DSX,          dsx),
+       OPC(5, OPC_DSY,          dsy),
+       OPC(5, OPC_GATHER4R,     gather4r),
+       OPC(5, OPC_GATHER4G,     gather4g),
+       OPC(5, OPC_GATHER4B,     gather4b),
+       OPC(5, OPC_GATHER4A,     gather4a),
+       OPC(5, OPC_SAMGP0,       samgp0),
+       OPC(5, OPC_SAMGP1,       samgp1),
+       OPC(5, OPC_SAMGP2,       samgp2),
+       OPC(5, OPC_SAMGP3,       samgp3),
+       OPC(5, OPC_DSXPP_1,      dsxpp.1),
+       OPC(5, OPC_DSYPP_1,      dsypp.1),
+       OPC(5, OPC_RGETPOS,      rgetpos),
+       OPC(5, OPC_RGETINFO,     rgetinfo),
+
+
+       /* category 6: */
+       OPC(6, OPC_LDG,          ldg),
+       OPC(6, OPC_LDL,          ldl),
+       OPC(6, OPC_LDP,          ldp),
+       OPC(6, OPC_STG,          stg),
+       OPC(6, OPC_STL,          stl),
+       OPC(6, OPC_STP,          stp),
+       OPC(6, OPC_STI,          sti),
+       OPC(6, OPC_G2L,          g2l),
+       OPC(6, OPC_L2G,          l2g),
+       OPC(6, OPC_PREFETCH,     prefetch),
+       OPC(6, OPC_LDLW,         ldlw),
+       OPC(6, OPC_STLW,         stlw),
+       OPC(6, OPC_RESFMT,       resfmt),
+       OPC(6, OPC_RESINFO,      resinfo),
+       OPC(6, OPC_ATOMIC_ADD,     atomic.add),
+       OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
+       OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
+       OPC(6, OPC_ATOMIC_INC,     atomic.inc),
+       OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
+       OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
+       OPC(6, OPC_ATOMIC_MIN,     atomic.min),
+       OPC(6, OPC_ATOMIC_MAX,     atomic.max),
+       OPC(6, OPC_ATOMIC_AND,     atomic.and),
+       OPC(6, OPC_ATOMIC_OR,      atomic.or),
+       OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
+       OPC(6, OPC_LDGB,         ldgb),
+       OPC(6, OPC_STGB,         stgb),
+       OPC(6, OPC_STIB,         stib),
+       OPC(6, OPC_LDC,          ldc),
+       OPC(6, OPC_LDLV,         ldlv),
+
+       OPC(7, OPC_BAR,          bar),
+       OPC(7, OPC_FENCE,        fence),
+
+#undef OPC
+};
+
+#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)]))
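+
+/* Illustrative example: a cat2 add.f instruction has opc_cat == 2 and
+ * instr_opc() == 0, so GETINFO looks up opcs[(2 << 6) | 0] == opcs[128] --
+ * the same slot the OPC(2, OPC_ADD_F, add.f) entry above initializes,
+ * since OPC_ADD_F == _OPC(2, 0) == 128.
+ */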
+
+// XXX hack.. probably should move this table somewhere common:
+#include "ir3.h"
+const char *ir3_instr_name(struct ir3_instruction *instr)
+{
+       if (opc_cat(instr->opc) == -1) return "??meta??";
+       return opcs[instr->opc].name;
+}
+
+static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
+{
+       instr_t *instr = (instr_t *)dwords;
+       uint32_t opc = instr_opc(instr);
+       const char *name;
+
+       if (debug & PRINT_VERBOSE)
+               fprintf(ctx->out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]);
+
+       /* NOTE: the order the flags are printed in is a bit fugly.. but for
+        * now I try to match the order in the llvm-a3xx disassembler for
+        * easy diff'ing..
+        */
+
+       ctx->repeat = instr_repeat(instr);
+
+       if (instr->sync)
+               fprintf(ctx->out, "(sy)");
+       if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7)))
+               fprintf(ctx->out, "(ss)");
+       if (instr->jmp_tgt)
+               fprintf(ctx->out, "(jp)");
+       if (instr_sat(instr))
+               fprintf(ctx->out, "(sat)");
+       if (ctx->repeat)
+               fprintf(ctx->out, "(rpt%d)", ctx->repeat);
+       if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
+               fprintf(ctx->out, "(ul)");
+
+       name = GETINFO(instr)->name;
+
+       if (name) {
+               fprintf(ctx->out, "%s", name);
+               GETINFO(instr)->print(ctx, instr);
+       } else {
+               fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc);
+       }
+
+       fprintf(ctx->out, "\n");
+
+       return (instr->opc_cat == 0) && (opc == OPC_END);
+}
+
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out)
+{
+       struct disasm_ctx ctx;
+       int i;
+
+       assert((sizedwords % 2) == 0);
+
+       memset(&ctx, 0, sizeof(ctx));
+       ctx.out = out;
+       ctx.level = level;
+
+       for (i = 0; i < sizedwords; i += 2)
+               print_instr(&ctx, &dwords[i], i/2);
+
+       return 0;
+}
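
For reference, a minimal usage sketch (not part of this patch) of how a caller
drives the disassembler above: instructions are 64 bits, so sizedwords must be
even, level indexes the levels[] indent table, and output goes to the given
FILE.  The instruction words here are hypothetical placeholders derived from
the cat0 bitfield layout in instr-a3xx.h below; a real caller would pass an
assembled shader binary, and would get the prototype from the driver's disasm
header rather than declaring it locally.

#include <stdint.h>
#include <stdio.h>

int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out);

int main(void)
{
	/* two 64-bit instructions: a cat0 nop (all zeros) and a cat0 end
	 * (opc field 6 in bits 23..26 of dword1, per instr_cat0_t):
	 */
	uint32_t words[] = {
		0x00000000, 0x00000000,  /* nop */
		0x00000000, 0x03000000,  /* end */
	};

	return disasm_a3xx(words, 4, 0, stdout);
}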
diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
new file mode 100644 (file)
index 0000000..7f60ee5
--- /dev/null
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef INSTR_A3XX_H_
+#define INSTR_A3XX_H_
+
+#define PACKED __attribute__((__packed__))
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <assert.h>
+
+/* size of largest OPC field of all the instruction categories: */
+#define NOPC_BITS 6
+
+#define _OPC(cat, opc)   (((cat) << NOPC_BITS) | opc)
+
+typedef enum {
+       /* category 0: */
+       OPC_NOP             = _OPC(0, 0),
+       OPC_BR              = _OPC(0, 1),
+       OPC_JUMP            = _OPC(0, 2),
+       OPC_CALL            = _OPC(0, 3),
+       OPC_RET             = _OPC(0, 4),
+       OPC_KILL            = _OPC(0, 5),
+       OPC_END             = _OPC(0, 6),
+       OPC_EMIT            = _OPC(0, 7),
+       OPC_CUT             = _OPC(0, 8),
+       OPC_CHMASK          = _OPC(0, 9),
+       OPC_CHSH            = _OPC(0, 10),
+       OPC_FLOW_REV        = _OPC(0, 11),
+
+       /* category 1: */
+       OPC_MOV             = _OPC(1, 0),
+
+       /* category 2: */
+       OPC_ADD_F           = _OPC(2, 0),
+       OPC_MIN_F           = _OPC(2, 1),
+       OPC_MAX_F           = _OPC(2, 2),
+       OPC_MUL_F           = _OPC(2, 3),
+       OPC_SIGN_F          = _OPC(2, 4),
+       OPC_CMPS_F          = _OPC(2, 5),
+       OPC_ABSNEG_F        = _OPC(2, 6),
+       OPC_CMPV_F          = _OPC(2, 7),
+       /* 8 - invalid */
+       OPC_FLOOR_F         = _OPC(2, 9),
+       OPC_CEIL_F          = _OPC(2, 10),
+       OPC_RNDNE_F         = _OPC(2, 11),
+       OPC_RNDAZ_F         = _OPC(2, 12),
+       OPC_TRUNC_F         = _OPC(2, 13),
+       /* 14-15 - invalid */
+       OPC_ADD_U           = _OPC(2, 16),
+       OPC_ADD_S           = _OPC(2, 17),
+       OPC_SUB_U           = _OPC(2, 18),
+       OPC_SUB_S           = _OPC(2, 19),
+       OPC_CMPS_U          = _OPC(2, 20),
+       OPC_CMPS_S          = _OPC(2, 21),
+       OPC_MIN_U           = _OPC(2, 22),
+       OPC_MIN_S           = _OPC(2, 23),
+       OPC_MAX_U           = _OPC(2, 24),
+       OPC_MAX_S           = _OPC(2, 25),
+       OPC_ABSNEG_S        = _OPC(2, 26),
+       /* 27 - invalid */
+       OPC_AND_B           = _OPC(2, 28),
+       OPC_OR_B            = _OPC(2, 29),
+       OPC_NOT_B           = _OPC(2, 30),
+       OPC_XOR_B           = _OPC(2, 31),
+       /* 32 - invalid */
+       OPC_CMPV_U          = _OPC(2, 33),
+       OPC_CMPV_S          = _OPC(2, 34),
+       /* 35-47 - invalid */
+       OPC_MUL_U           = _OPC(2, 48),
+       OPC_MUL_S           = _OPC(2, 49),
+       OPC_MULL_U          = _OPC(2, 50),
+       OPC_BFREV_B         = _OPC(2, 51),
+       OPC_CLZ_S           = _OPC(2, 52),
+       OPC_CLZ_B           = _OPC(2, 53),
+       OPC_SHL_B           = _OPC(2, 54),
+       OPC_SHR_B           = _OPC(2, 55),
+       OPC_ASHR_B          = _OPC(2, 56),
+       OPC_BARY_F          = _OPC(2, 57),
+       OPC_MGEN_B          = _OPC(2, 58),
+       OPC_GETBIT_B        = _OPC(2, 59),
+       OPC_SETRM           = _OPC(2, 60),
+       OPC_CBITS_B         = _OPC(2, 61),
+       OPC_SHB             = _OPC(2, 62),
+       OPC_MSAD            = _OPC(2, 63),
+
+       /* category 3: */
+       OPC_MAD_U16         = _OPC(3, 0),
+       OPC_MADSH_U16       = _OPC(3, 1),
+       OPC_MAD_S16         = _OPC(3, 2),
+       OPC_MADSH_M16       = _OPC(3, 3),   /* should this be .s16? */
+       OPC_MAD_U24         = _OPC(3, 4),
+       OPC_MAD_S24         = _OPC(3, 5),
+       OPC_MAD_F16         = _OPC(3, 6),
+       OPC_MAD_F32         = _OPC(3, 7),
+       OPC_SEL_B16         = _OPC(3, 8),
+       OPC_SEL_B32         = _OPC(3, 9),
+       OPC_SEL_S16         = _OPC(3, 10),
+       OPC_SEL_S32         = _OPC(3, 11),
+       OPC_SEL_F16         = _OPC(3, 12),
+       OPC_SEL_F32         = _OPC(3, 13),
+       OPC_SAD_S16         = _OPC(3, 14),
+       OPC_SAD_S32         = _OPC(3, 15),
+
+       /* category 4: */
+       OPC_RCP             = _OPC(4, 0),
+       OPC_RSQ             = _OPC(4, 1),
+       OPC_LOG2            = _OPC(4, 2),
+       OPC_EXP2            = _OPC(4, 3),
+       OPC_SIN             = _OPC(4, 4),
+       OPC_COS             = _OPC(4, 5),
+       OPC_SQRT            = _OPC(4, 6),
+       // 7-63 - invalid
+
+       /* category 5: */
+       OPC_ISAM            = _OPC(5, 0),
+       OPC_ISAML           = _OPC(5, 1),
+       OPC_ISAMM           = _OPC(5, 2),
+       OPC_SAM             = _OPC(5, 3),
+       OPC_SAMB            = _OPC(5, 4),
+       OPC_SAML            = _OPC(5, 5),
+       OPC_SAMGQ           = _OPC(5, 6),
+       OPC_GETLOD          = _OPC(5, 7),
+       OPC_CONV            = _OPC(5, 8),
+       OPC_CONVM           = _OPC(5, 9),
+       OPC_GETSIZE         = _OPC(5, 10),
+       OPC_GETBUF          = _OPC(5, 11),
+       OPC_GETPOS          = _OPC(5, 12),
+       OPC_GETINFO         = _OPC(5, 13),
+       OPC_DSX             = _OPC(5, 14),
+       OPC_DSY             = _OPC(5, 15),
+       OPC_GATHER4R        = _OPC(5, 16),
+       OPC_GATHER4G        = _OPC(5, 17),
+       OPC_GATHER4B        = _OPC(5, 18),
+       OPC_GATHER4A        = _OPC(5, 19),
+       OPC_SAMGP0          = _OPC(5, 20),
+       OPC_SAMGP1          = _OPC(5, 21),
+       OPC_SAMGP2          = _OPC(5, 22),
+       OPC_SAMGP3          = _OPC(5, 23),
+       OPC_DSXPP_1         = _OPC(5, 24),
+       OPC_DSYPP_1         = _OPC(5, 25),
+       OPC_RGETPOS         = _OPC(5, 26),
+       OPC_RGETINFO        = _OPC(5, 27),
+
+       /* category 6: */
+       OPC_LDG             = _OPC(6, 0),        /* load-global */
+       OPC_LDL             = _OPC(6, 1),
+       OPC_LDP             = _OPC(6, 2),
+       OPC_STG             = _OPC(6, 3),        /* store-global */
+       OPC_STL             = _OPC(6, 4),
+       OPC_STP             = _OPC(6, 5),
+       OPC_STI             = _OPC(6, 6),
+       OPC_G2L             = _OPC(6, 7),
+       OPC_L2G             = _OPC(6, 8),
+       OPC_PREFETCH        = _OPC(6, 9),
+       OPC_LDLW            = _OPC(6, 10),
+       OPC_STLW            = _OPC(6, 11),
+       OPC_RESFMT          = _OPC(6, 14),
+       OPC_RESINFO         = _OPC(6, 15),
+       OPC_ATOMIC_ADD      = _OPC(6, 16),
+       OPC_ATOMIC_SUB      = _OPC(6, 17),
+       OPC_ATOMIC_XCHG     = _OPC(6, 18),
+       OPC_ATOMIC_INC      = _OPC(6, 19),
+       OPC_ATOMIC_DEC      = _OPC(6, 20),
+       OPC_ATOMIC_CMPXCHG  = _OPC(6, 21),
+       OPC_ATOMIC_MIN      = _OPC(6, 22),
+       OPC_ATOMIC_MAX      = _OPC(6, 23),
+       OPC_ATOMIC_AND      = _OPC(6, 24),
+       OPC_ATOMIC_OR       = _OPC(6, 25),
+       OPC_ATOMIC_XOR      = _OPC(6, 26),
+       OPC_LDGB            = _OPC(6, 27),
+       OPC_STGB            = _OPC(6, 28),
+       OPC_STIB            = _OPC(6, 29),
+       OPC_LDC             = _OPC(6, 30),
+       OPC_LDLV            = _OPC(6, 31),
+
+       /* category 7: */
+       OPC_BAR             = _OPC(7, 0),
+       OPC_FENCE           = _OPC(7, 1),
+
+       /* meta instructions (category -1): */
+       /* placeholder instr to mark shader inputs: */
+       OPC_META_INPUT      = _OPC(-1, 0),
+       /* The "fan-in" and "fan-out" instructions are used for keeping
+        * track of instructions that write to multiple dst registers
+        * (fan-out) like texture sample instructions, or read multiple
+        * consecutive scalar registers (fan-in) (bary.f, texture samp)
+        */
+       OPC_META_FO         = _OPC(-1, 2),
+       OPC_META_FI         = _OPC(-1, 3),
+
+} opc_t;
+
+#define opc_cat(opc) ((int)((opc) >> NOPC_BITS))
+#define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))
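+
+/* Worked example (illustrative only): OPC_MAD_F32 == _OPC(3, 7) ==
+ * (3 << 6) | 7 == 199, so opc_cat(OPC_MAD_F32) recovers the category (3)
+ * and opc_op(OPC_MAD_F32) recovers the per-category opcode (7).
+ */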
+
+typedef enum {
+       TYPE_F16 = 0,
+       TYPE_F32 = 1,
+       TYPE_U16 = 2,
+       TYPE_U32 = 3,
+       TYPE_S16 = 4,
+       TYPE_S32 = 5,
+       TYPE_U8  = 6,
+       TYPE_S8  = 7,  // XXX I assume?
+} type_t;
+
+static inline uint32_t type_size(type_t type)
+{
+       switch (type) {
+       case TYPE_F32:
+       case TYPE_U32:
+       case TYPE_S32:
+               return 32;
+       case TYPE_F16:
+       case TYPE_U16:
+       case TYPE_S16:
+               return 16;
+       case TYPE_U8:
+       case TYPE_S8:
+               return 8;
+       default:
+               assert(0); /* invalid type */
+               return 0;
+       }
+}
+
+static inline int type_float(type_t type)
+{
+       return (type == TYPE_F32) || (type == TYPE_F16);
+}
+
+static inline int type_uint(type_t type)
+{
+       return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
+}
+
+static inline int type_sint(type_t type)
+{
+       return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
+}
+
+typedef union PACKED {
+       /* normal gpr or const src register: */
+       struct PACKED {
+               uint32_t comp  : 2;
+               uint32_t num   : 10;
+       };
+       /* for immediate val: */
+       int32_t  iim_val   : 11;
+       /* to make compiler happy: */
+       uint32_t dummy32;
+       uint32_t dummy10   : 10;
+       int32_t  idummy10  : 10;
+       uint32_t dummy11   : 11;
+       uint32_t dummy12   : 12;
+       uint32_t dummy13   : 13;
+       uint32_t dummy8    : 8;
+} reg_t;
+
+/* special registers: */
+#define REG_A0 61       /* address register */
+#define REG_P0 62       /* predicate register */
+
+static inline int reg_special(reg_t reg)
+{
+       return (reg.num == REG_A0) || (reg.num == REG_P0);
+}
+
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               struct PACKED {
+                       int16_t  immed    : 16;
+                       uint32_t dummy1   : 16;
+               } a3xx;
+               struct PACKED {
+                       int32_t  immed    : 20;
+                       uint32_t dummy1   : 12;
+               } a4xx;
+               struct PACKED {
+                       int32_t immed     : 32;
+               } a5xx;
+       };
+
+       /* dword1: */
+       uint32_t dummy2   : 8;
+       uint32_t repeat   : 3;
+       uint32_t dummy3   : 1;
+       uint32_t ss       : 1;
+       uint32_t dummy4   : 7;
+       uint32_t inv      : 1;
+       uint32_t comp     : 2;
+       uint32_t opc      : 4;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat0_t;
+
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               /* for normal src register: */
+               struct PACKED {
+                       uint32_t src : 11;
+                       /* at least the low bit of pad must be zero or it will
+                        * look like an address-relative src
+                        */
+                       uint32_t pad : 21;
+               };
+               /* for address relative: */
+               struct PACKED {
+                       int32_t  off : 10;
+                       uint32_t src_rel_c : 1;
+                       uint32_t src_rel : 1;
+                       uint32_t unknown : 20;
+               };
+               /* for immediate: */
+               int32_t  iim_val;
+               uint32_t uim_val;
+               float    fim_val;
+       };
+
+       /* dword1: */
+       uint32_t dst        : 8;
+       uint32_t repeat     : 3;
+       uint32_t src_r      : 1;
+       uint32_t ss         : 1;
+       uint32_t ul         : 1;
+       uint32_t dst_type   : 3;
+       uint32_t dst_rel    : 1;
+       uint32_t src_type   : 3;
+       uint32_t src_c      : 1;
+       uint32_t src_im     : 1;
+       uint32_t even       : 1;
+       uint32_t pos_inf    : 1;
+       uint32_t must_be_0  : 2;
+       uint32_t jmp_tgt    : 1;
+       uint32_t sync       : 1;
+       uint32_t opc_cat    : 3;
+} instr_cat1_t;
+
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src1         : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src1_im      : 1;   /* immediate */
+                       uint32_t src1_neg     : 1;   /* negate */
+                       uint32_t src1_abs     : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src1         : 10;
+                       uint32_t src1_c       : 1;   /* relative-const */
+                       uint32_t src1_rel     : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel1;
+               struct PACKED {
+                       uint32_t src1         : 12;
+                       uint32_t src1_c       : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c1;
+       };
+
+       union PACKED {
+               struct PACKED {
+                       uint32_t src2         : 11;
+                       uint32_t must_be_zero2: 2;
+                       uint32_t src2_im      : 1;   /* immediate */
+                       uint32_t src2_neg     : 1;   /* negate */
+                       uint32_t src2_abs     : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src2         : 10;
+                       uint32_t src2_c       : 1;   /* relative-const */
+                       uint32_t src2_rel     : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel2;
+               struct PACKED {
+                       uint32_t src2         : 12;
+                       uint32_t src2_c       : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c2;
+       };
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 2;
+       uint32_t sat      : 1;
+       uint32_t src1_r   : 1;
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;   /* dunno */
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t ei       : 1;
+       uint32_t cond     : 3;
+       uint32_t src2_r   : 1;
+       uint32_t full     : 1;   /* not half */
+       uint32_t opc      : 6;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat2_t;
+
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src1         : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src2_c       : 1;
+                       uint32_t src1_neg     : 1;
+                       uint32_t src2_r       : 1;
+               };
+               struct PACKED {
+                       uint32_t src1         : 10;
+                       uint32_t src1_c       : 1;
+                       uint32_t src1_rel     : 1;
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel1;
+               struct PACKED {
+                       uint32_t src1         : 12;
+                       uint32_t src1_c       : 1;
+                       uint32_t dummy        : 3;
+               } c1;
+       };
+
+       union PACKED {
+               struct PACKED {
+                       uint32_t src3         : 11;
+                       uint32_t must_be_zero2: 2;
+                       uint32_t src3_r       : 1;
+                       uint32_t src2_neg     : 1;
+                       uint32_t src3_neg     : 1;
+               };
+               struct PACKED {
+                       uint32_t src3         : 10;
+                       uint32_t src3_c       : 1;
+                       uint32_t src3_rel     : 1;
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel2;
+               struct PACKED {
+                       uint32_t src3         : 12;
+                       uint32_t src3_c       : 1;
+                       uint32_t dummy        : 3;
+               } c2;
+       };
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 2;
+       uint32_t sat      : 1;
+       uint32_t src1_r   : 1;
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t src2     : 8;
+       uint32_t opc      : 4;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat3_t;
+
+static inline bool instr_cat3_full(instr_cat3_t *cat3)
+{
+       switch (_OPC(3, cat3->opc)) {
+       case OPC_MAD_F16:
+       case OPC_MAD_U16:
+       case OPC_MAD_S16:
+       case OPC_SEL_B16:
+       case OPC_SEL_S16:
+       case OPC_SEL_F16:
+       case OPC_SAD_S16:
+       case OPC_SAD_S32:  // really??
+               return false;
+       default:
+               return true;
+       }
+}
+
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               struct PACKED {
+                       uint32_t src          : 11;
+                       uint32_t must_be_zero1: 2;
+                       uint32_t src_im       : 1;   /* immediate */
+                       uint32_t src_neg      : 1;   /* negate */
+                       uint32_t src_abs      : 1;   /* absolute value */
+               };
+               struct PACKED {
+                       uint32_t src          : 10;
+                       uint32_t src_c        : 1;   /* relative-const */
+                       uint32_t src_rel      : 1;   /* relative address */
+                       uint32_t must_be_zero : 1;
+                       uint32_t dummy        : 3;
+               } rel;
+               struct PACKED {
+                       uint32_t src          : 12;
+                       uint32_t src_c        : 1;   /* const */
+                       uint32_t dummy        : 3;
+               } c;
+       };
+       uint32_t dummy1   : 16;  /* seem to be ignored */
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t repeat   : 2;
+       uint32_t sat      : 1;
+       uint32_t src_r    : 1;
+       uint32_t ss       : 1;
+       uint32_t ul       : 1;
+       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
+       uint32_t dummy2   : 5;   /* seem to be ignored */
+       uint32_t full     : 1;   /* not half */
+       uint32_t opc      : 6;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat4_t;
+
+typedef struct PACKED {
+       /* dword0: */
+       union PACKED {
+               /* normal case: */
+               struct PACKED {
+                       uint32_t full     : 1;   /* not half */
+                       uint32_t src1     : 8;
+                       uint32_t src2     : 8;
+                       uint32_t dummy1   : 4;   /* seem to be ignored */
+                       uint32_t samp     : 4;
+                       uint32_t tex      : 7;
+               } norm;
+               /* s2en case: */
+               struct PACKED {
+                       uint32_t full     : 1;   /* not half */
+                       uint32_t src1     : 8;
+                       uint32_t src2     : 11;
+                       uint32_t dummy1   : 1;
+                       uint32_t src3     : 8;
+                       uint32_t dummy2   : 3;
+               } s2en;
+               /* same in either case: */
+               // XXX I think, confirm this
+               struct PACKED {
+                       uint32_t full     : 1;   /* not half */
+                       uint32_t src1     : 8;
+                       uint32_t pad      : 23;
+               };
+       };
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t wrmask   : 4;   /* write-mask */
+       uint32_t type     : 3;
+       uint32_t dummy2   : 1;   /* seems to be ignored */
+       uint32_t is_3d    : 1;
+
+       uint32_t is_a     : 1;
+       uint32_t is_s     : 1;
+       uint32_t is_s2en  : 1;
+       uint32_t is_o     : 1;
+       uint32_t is_p     : 1;
+
+       uint32_t opc      : 5;
+       uint32_t jmp_tgt  : 1;
+       uint32_t sync     : 1;
+       uint32_t opc_cat  : 3;
+} instr_cat5_t;
+
+/* dword0 encoding for src_off: [src1 + off], src2: */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t mustbe1  : 1;
+       int32_t  off      : 13;
+       uint32_t src1     : 8;
+       uint32_t src1_im  : 1;
+       uint32_t src2_im  : 1;
+       uint32_t src2     : 8;
+
+       /* dword1: */
+       uint32_t dword1;
+} instr_cat6a_t;
+
+/* dword0 encoding for !src_off: [src1], src2 */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t mustbe0  : 1;
+       uint32_t src1     : 13;
+       uint32_t ignore0  : 8;
+       uint32_t src1_im  : 1;
+       uint32_t src2_im  : 1;
+       uint32_t src2     : 8;
+
+       /* dword1: */
+       uint32_t dword1;
+} instr_cat6b_t;
+
+/* dword1 encoding for dst_off: */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t dword0;
+
+       /* note: there is some weird stuff going on where sometimes
+        * cat6->a.off is involved.. but that seems like a bug in
+        * the blob, since it is used even if !cat6->src_off.
+        * It would make sense for there to be some more bits to
+        * bring us to 11 bits worth of offset, but not sure..
+        */
+       int32_t off       : 8;
+       uint32_t mustbe1  : 1;
+       uint32_t dst      : 8;
+       uint32_t pad1     : 15;
+} instr_cat6c_t;
+
+/* dword1 encoding for !dst_off: */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t dword0;
+
+       uint32_t dst      : 8;
+       uint32_t mustbe0  : 1;
+       uint32_t idx      : 8;
+       uint32_t pad0     : 15;
+} instr_cat6d_t;
+
+/* ldgb and atomics..
+ *
+ * ldgb:      pad0=0, pad3=1
+ * atomic .g: pad0=1, pad3=1
+ *        .l: pad0=1, pad3=0
+ */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t pad0     : 1;
+       uint32_t src3     : 8;
+       uint32_t d        : 2;
+       uint32_t typed    : 1;
+       uint32_t type_size : 2;
+       uint32_t src1     : 8;
+       uint32_t src1_im  : 1;
+       uint32_t src2_im  : 1;
+       uint32_t src2     : 8;
+
+       /* dword1: */
+       uint32_t dst      : 8;
+       uint32_t mustbe0  : 1;
+       uint32_t src_ssbo : 8;
+       uint32_t pad2     : 3;  // type
+       uint32_t g        : 1;
+       uint32_t pad3     : 1;
+       uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6ldgb_t;
+
+/* stgb, pad0=0, pad3=2
+ */
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t mustbe1  : 1;  // ???
+       uint32_t src1     : 8;
+       uint32_t d        : 2;
+       uint32_t typed    : 1;
+       uint32_t type_size : 2;
+       uint32_t pad0     : 9;
+       uint32_t src2_im  : 1;
+       uint32_t src2     : 8;
+
+       /* dword1: */
+       uint32_t src3     : 8;
+       uint32_t src3_im  : 1;
+       uint32_t dst_ssbo : 8;
+       uint32_t pad2     : 3;  // type
+       uint32_t pad3     : 2;
+       uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
+} instr_cat6stgb_t;
+
+typedef union PACKED {
+       instr_cat6a_t a;
+       instr_cat6b_t b;
+       instr_cat6c_t c;
+       instr_cat6d_t d;
+       instr_cat6ldgb_t ldgb;
+       instr_cat6stgb_t stgb;
+       struct PACKED {
+               /* dword0: */
+               uint32_t src_off  : 1;
+               uint32_t pad1     : 31;
+
+               /* dword1: */
+               uint32_t pad2     : 8;
+               uint32_t dst_off  : 1;
+               uint32_t pad3     : 8;
+               uint32_t type     : 3;
+               uint32_t g        : 1;  /* or in some cases it means dst immed */
+               uint32_t pad4     : 1;
+               uint32_t opc      : 5;
+               uint32_t jmp_tgt  : 1;
+               uint32_t sync     : 1;
+               uint32_t opc_cat  : 3;
+       };
+} instr_cat6_t;
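+
+/* Decode sketch: the src_off/dst_off bits in the common layout above
+ * select which of the a/b (dword0) and c/d (dword1) encodings apply,
+ * e.g. given a decoded "instr_cat6_t *cat6":
+ *
+ *   uint32_t src1 = cat6->src_off ? cat6->a.src1 : cat6->b.src1;
+ */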
+
+typedef struct PACKED {
+       /* dword0: */
+       uint32_t pad1     : 32;
+
+       /* dword1: */
+       uint32_t pad2     : 12;
+       uint32_t ss       : 1;  /* maybe in the encoding, but blob only uses (sy) */
+       uint32_t pad3     : 6;
+       uint32_t w        : 1;  /* write */
+       uint32_t r        : 1;  /* read */
+       uint32_t l        : 1;  /* local */
+       uint32_t g        : 1;  /* global */
+       uint32_t opc      : 4;  /* presumed, but only a couple known OPCs */
+       uint32_t jmp_tgt  : 1;  /* (jp) */
+       uint32_t sync     : 1;  /* (sy) */
+       uint32_t opc_cat  : 3;
+} instr_cat7_t;
+
+typedef union PACKED {
+       instr_cat0_t cat0;
+       instr_cat1_t cat1;
+       instr_cat2_t cat2;
+       instr_cat3_t cat3;
+       instr_cat4_t cat4;
+       instr_cat5_t cat5;
+       instr_cat6_t cat6;
+       instr_cat7_t cat7;
+       struct PACKED {
+               /* dword0: */
+               uint32_t pad1     : 32;
+
+               /* dword1: */
+               uint32_t pad2     : 12;
+               uint32_t ss       : 1;  /* cat1-cat4 (cat0??) and cat7 (?) */
+               uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
+               uint32_t pad3     : 13;
+               uint32_t jmp_tgt  : 1;
+               uint32_t sync     : 1;
+               uint32_t opc_cat  : 3;
+
+       };
+} instr_t;
+
+static inline uint32_t instr_repeat(instr_t *instr)
+{
+       switch (instr->opc_cat) {
+       case 0:  return instr->cat0.repeat;
+       case 1:  return instr->cat1.repeat;
+       case 2:  return instr->cat2.repeat;
+       case 3:  return instr->cat3.repeat;
+       case 4:  return instr->cat4.repeat;
+       default: return 0;
+       }
+}
+
+static inline bool instr_sat(instr_t *instr)
+{
+       switch (instr->opc_cat) {
+       case 2:  return instr->cat2.sat;
+       case 3:  return instr->cat3.sat;
+       case 4:  return instr->cat4.sat;
+       default: return false;
+       }
+}
+
+static inline uint32_t instr_opc(instr_t *instr)
+{
+       switch (instr->opc_cat) {
+       case 0:  return instr->cat0.opc;
+       case 1:  return 0;
+       case 2:  return instr->cat2.opc;
+       case 3:  return instr->cat3.opc;
+       case 4:  return instr->cat4.opc;
+       case 5:  return instr->cat5.opc;
+       case 6:  return instr->cat6.opc;
+       case 7:  return instr->cat7.opc;
+       default: return 0;
+       }
+}
+
+static inline bool is_mad(opc_t opc)
+{
+       switch (opc) {
+       case OPC_MAD_U16:
+       case OPC_MAD_S16:
+       case OPC_MAD_U24:
+       case OPC_MAD_S24:
+       case OPC_MAD_F16:
+       case OPC_MAD_F32:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool is_madsh(opc_t opc)
+{
+       switch (opc) {
+       case OPC_MADSH_U16:
+       case OPC_MADSH_M16:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool is_atomic(opc_t opc)
+{
+       switch (opc) {
+       case OPC_ATOMIC_ADD:
+       case OPC_ATOMIC_SUB:
+       case OPC_ATOMIC_XCHG:
+       case OPC_ATOMIC_INC:
+       case OPC_ATOMIC_DEC:
+       case OPC_ATOMIC_CMPXCHG:
+       case OPC_ATOMIC_MIN:
+       case OPC_ATOMIC_MAX:
+       case OPC_ATOMIC_AND:
+       case OPC_ATOMIC_OR:
+       case OPC_ATOMIC_XOR:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool is_ssbo(opc_t opc)
+{
+       switch (opc) {
+       case OPC_RESFMT:
+       case OPC_RESINFO:
+       case OPC_LDGB:
+       case OPC_STGB:
+       case OPC_STIB:
+               return true;
+       default:
+               return false;
+       }
+}
+
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out);
+
+#endif /* INSTR_A3XX_H_ */
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
new file mode 100644 (file)
index 0000000..3d1c444
--- /dev/null
@@ -0,0 +1,941 @@
+/*
+ * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ir3.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <errno.h>
+
+#include "util/bitscan.h"
+#include "util/ralloc.h"
+#include "util/u_math.h"
+
+#include "instr-a3xx.h"
+
+/* simple allocator to carve allocations out of an up-front allocated heap,
+ * so that we can free everything easily in one shot.  For now it is
+ * implemented with ralloc instead (see the TODO below).
+ */
+void * ir3_alloc(struct ir3 *shader, int sz)
+{
+       return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
+}
+
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+               unsigned nin, unsigned nout)
+{
+       struct ir3 *shader = rzalloc(compiler, struct ir3);
+
+       shader->compiler = compiler;
+       shader->ninputs = nin;
+       shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin);
+
+       shader->noutputs = nout;
+       shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
+
+       list_inithead(&shader->block_list);
+       list_inithead(&shader->array_list);
+
+       return shader;
+}
+
+void ir3_destroy(struct ir3 *shader)
+{
+       ralloc_free(shader);
+}
+
+#define iassert(cond) do { \
+       if (!(cond)) { \
+               debug_assert(cond); \
+               return -1; \
+       } } while (0)
+
+#define iassert_type(reg, full) do { \
+       if ((full)) { \
+               iassert(!((reg)->flags & IR3_REG_HALF)); \
+       } else { \
+               iassert((reg)->flags & IR3_REG_HALF); \
+       } } while (0)
+
+static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
+               uint32_t repeat, uint32_t valid_flags)
+{
+       reg_t val = { .dummy32 = 0 };
+
+       if (reg->flags & ~valid_flags) {
+               debug_printf("INVALID FLAGS: %x vs %x\n",
+                               reg->flags, valid_flags);
+       }
+
+       if (!(reg->flags & IR3_REG_R))
+               repeat = 0;
+
+       if (reg->flags & IR3_REG_IMMED) {
+               val.iim_val = reg->iim_val;
+       } else {
+               unsigned components;
+               int16_t max;
+
+               if (reg->flags & IR3_REG_RELATIV) {
+                       components = reg->size;
+                       val.idummy10 = reg->array.offset;
+                       max = (reg->array.offset + repeat + components - 1) >> 2;
+               } else {
+                       components = util_last_bit(reg->wrmask);
+                       val.comp = reg->num & 0x3;
+                       val.num  = reg->num >> 2;
+                       max = (reg->num + repeat + components - 1) >> 2;
+               }
+
+               if (reg->flags & IR3_REG_CONST) {
+                       info->max_const = MAX2(info->max_const, max);
+               } else if (val.num == 63) {
+                       /* ignore writes to dummy register r63.x */
+               } else if (max < 48) {
+                       if (reg->flags & IR3_REG_HALF) {
+                               if (info->gpu_id >= 600) {
+                                       /* starting w/ a6xx, half regs conflict with full regs: */
+                                       info->max_reg = MAX2(info->max_reg, (max+1)/2);
+                               } else {
+                                       info->max_half_reg = MAX2(info->max_half_reg, max);
+                               }
+                       } else {
+                               info->max_reg = MAX2(info->max_reg, max);
+                       }
+               }
+       }
+
+       return val.dummy32;
+}
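+
+/* Worked example of the footprint math above: a full (non-half) r3.y
+ * has flat num (3 << 2) | 1 == 13; with repeat == 0 and a single
+ * component, max = (13 + 0 + 1 - 1) >> 2 == 3, ie. the highest full
+ * GPR used so far is at least r3.
+ */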
+
+static int emit_cat0(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       instr_cat0_t *cat0 = ptr;
+
+       if (info->gpu_id >= 500) {
+               cat0->a5xx.immed = instr->cat0.immed;
+       } else if (info->gpu_id >= 400) {
+               cat0->a4xx.immed = instr->cat0.immed;
+       } else {
+               cat0->a3xx.immed = instr->cat0.immed;
+       }
+       cat0->repeat   = instr->repeat;
+       cat0->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat0->inv      = instr->cat0.inv;
+       cat0->comp     = instr->cat0.comp;
+       cat0->opc      = instr->opc;
+       cat0->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat0->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat0->opc_cat  = 0;
+
+       return 0;
+}
+
+static int emit_cat1(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src = instr->regs[1];
+       instr_cat1_t *cat1 = ptr;
+
+       iassert(instr->regs_count == 2);
+       iassert_type(dst, type_size(instr->cat1.dst_type) == 32);
+       if (!(src->flags & IR3_REG_IMMED))
+               iassert_type(src, type_size(instr->cat1.src_type) == 32);
+
+       if (src->flags & IR3_REG_IMMED) {
+               cat1->iim_val = src->iim_val;
+               cat1->src_im  = 1;
+       } else if (src->flags & IR3_REG_RELATIV) {
+               cat1->off       = reg(src, info, instr->repeat,
+                               IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV);
+               cat1->src_rel   = 1;
+               cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
+       } else {
+               cat1->src  = reg(src, info, instr->repeat,
+                               IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF);
+               cat1->src_c     = !!(src->flags & IR3_REG_CONST);
+       }
+
+       cat1->dst      = reg(dst, info, instr->repeat,
+                       IR3_REG_RELATIV | IR3_REG_EVEN |
+                       IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF);
+       cat1->repeat   = instr->repeat;
+       cat1->src_r    = !!(src->flags & IR3_REG_R);
+       cat1->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat1->ul       = !!(instr->flags & IR3_INSTR_UL);
+       cat1->dst_type = instr->cat1.dst_type;
+       cat1->dst_rel  = !!(dst->flags & IR3_REG_RELATIV);
+       cat1->src_type = instr->cat1.src_type;
+       cat1->even     = !!(dst->flags & IR3_REG_EVEN);
+       cat1->pos_inf  = !!(dst->flags & IR3_REG_POS_INF);
+       cat1->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat1->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat1->opc_cat  = 1;
+
+       return 0;
+}
+
+static int emit_cat2(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src1 = instr->regs[1];
+       struct ir3_register *src2 = instr->regs[2];
+       instr_cat2_t *cat2 = ptr;
+       unsigned absneg = ir3_cat2_absneg(instr->opc);
+
+       iassert((instr->regs_count == 2) || (instr->regs_count == 3));
+
+       if (src1->flags & IR3_REG_RELATIV) {
+               iassert(src1->array.offset < (1 << 10));
+               cat2->rel1.src1      = reg(src1, info, instr->repeat,
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+                               IR3_REG_HALF | absneg);
+               cat2->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
+               cat2->rel1.src1_rel  = 1;
+       } else if (src1->flags & IR3_REG_CONST) {
+               iassert(src1->num < (1 << 12));
+               cat2->c1.src1   = reg(src1, info, instr->repeat,
+                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+               cat2->c1.src1_c = 1;
+       } else {
+               iassert(src1->num < (1 << 11));
+               cat2->src1 = reg(src1, info, instr->repeat,
+                               IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
+                               absneg);
+       }
+       cat2->src1_im  = !!(src1->flags & IR3_REG_IMMED);
+       cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+       cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | IR3_REG_SABS));
+       cat2->src1_r   = !!(src1->flags & IR3_REG_R);
+
+       if (src2) {
+               iassert((src2->flags & IR3_REG_IMMED) ||
+                               !((src1->flags ^ src2->flags) & IR3_REG_HALF));
+
+               if (src2->flags & IR3_REG_RELATIV) {
+                       iassert(src2->array.offset < (1 << 10));
+                       cat2->rel2.src2      = reg(src2, info, instr->repeat,
+                                       IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+                                       IR3_REG_HALF | absneg);
+                       cat2->rel2.src2_c    = !!(src2->flags & IR3_REG_CONST);
+                       cat2->rel2.src2_rel  = 1;
+               } else if (src2->flags & IR3_REG_CONST) {
+                       iassert(src2->num < (1 << 12));
+                       cat2->c2.src2   = reg(src2, info, instr->repeat,
+                                       IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+                       cat2->c2.src2_c = 1;
+               } else {
+                       iassert(src2->num < (1 << 11));
+                       cat2->src2 = reg(src2, info, instr->repeat,
+                                       IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
+                                       absneg);
+               }
+
+               cat2->src2_im  = !!(src2->flags & IR3_REG_IMMED);
+               cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+               cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS));
+               cat2->src2_r   = !!(src2->flags & IR3_REG_R);
+       }
+
+       cat2->dst      = reg(dst, info, instr->repeat,
+                       IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
+       cat2->repeat   = instr->repeat;
+       cat2->sat      = !!(instr->flags & IR3_INSTR_SAT);
+       cat2->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat2->ul       = !!(instr->flags & IR3_INSTR_UL);
+       cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
+       cat2->ei       = !!(dst->flags & IR3_REG_EI);
+       cat2->cond     = instr->cat2.condition;
+       cat2->full     = ! (src1->flags & IR3_REG_HALF);
+       cat2->opc      = instr->opc;
+       cat2->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat2->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat2->opc_cat  = 2;
+
+       return 0;
+}
+
+static int emit_cat3(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src1 = instr->regs[1];
+       struct ir3_register *src2 = instr->regs[2];
+       struct ir3_register *src3 = instr->regs[3];
+       unsigned absneg = ir3_cat3_absneg(instr->opc);
+       instr_cat3_t *cat3 = ptr;
+       uint32_t src_flags = 0;
+
+       switch (instr->opc) {
+       case OPC_MAD_F16:
+       case OPC_MAD_U16:
+       case OPC_MAD_S16:
+       case OPC_SEL_B16:
+       case OPC_SEL_S16:
+       case OPC_SEL_F16:
+       case OPC_SAD_S16:
+       case OPC_SAD_S32:  // really??
+               src_flags |= IR3_REG_HALF;
+               break;
+       default:
+               break;
+       }
+
+       iassert(instr->regs_count == 4);
+       iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF));
+       iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF));
+       iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
+
+       if (src1->flags & IR3_REG_RELATIV) {
+               iassert(src1->array.offset < (1 << 10));
+               cat3->rel1.src1      = reg(src1, info, instr->repeat,
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+                               IR3_REG_HALF | absneg);
+               cat3->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
+               cat3->rel1.src1_rel  = 1;
+       } else if (src1->flags & IR3_REG_CONST) {
+               iassert(src1->num < (1 << 12));
+               cat3->c1.src1   = reg(src1, info, instr->repeat,
+                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+               cat3->c1.src1_c = 1;
+       } else {
+               iassert(src1->num < (1 << 11));
+               cat3->src1 = reg(src1, info, instr->repeat,
+                               IR3_REG_R | IR3_REG_HALF | absneg);
+       }
+
+       cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+       cat3->src1_r   = !!(src1->flags & IR3_REG_R);
+
+       cat3->src2     = reg(src2, info, instr->repeat,
+                       IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg);
+       cat3->src2_c   = !!(src2->flags & IR3_REG_CONST);
+       cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+       cat3->src2_r   = !!(src2->flags & IR3_REG_R);
+
+       if (src3->flags & IR3_REG_RELATIV) {
+               iassert(src3->array.offset < (1 << 10));
+               cat3->rel2.src3      = reg(src3, info, instr->repeat,
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+                               IR3_REG_HALF | absneg);
+               cat3->rel2.src3_c    = !!(src3->flags & IR3_REG_CONST);
+               cat3->rel2.src3_rel  = 1;
+       } else if (src3->flags & IR3_REG_CONST) {
+               iassert(src3->num < (1 << 12));
+               cat3->c2.src3   = reg(src3, info, instr->repeat,
+                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
+               cat3->c2.src3_c = 1;
+       } else {
+               iassert(src3->num < (1 << 11));
+               cat3->src3 = reg(src3, info, instr->repeat,
+                               IR3_REG_R | IR3_REG_HALF | absneg);
+       }
+
+       cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+       cat3->src3_r   = !!(src3->flags & IR3_REG_R);
+
+       cat3->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+       cat3->repeat   = instr->repeat;
+       cat3->sat      = !!(instr->flags & IR3_INSTR_SAT);
+       cat3->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat3->ul       = !!(instr->flags & IR3_INSTR_UL);
+       cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
+       cat3->opc      = instr->opc;
+       cat3->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat3->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat3->opc_cat  = 3;
+
+       return 0;
+}
+
+static int emit_cat4(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src = instr->regs[1];
+       instr_cat4_t *cat4 = ptr;
+
+       iassert(instr->regs_count == 2);
+
+       if (src->flags & IR3_REG_RELATIV) {
+               iassert(src->array.offset < (1 << 10));
+               cat4->rel.src      = reg(src, info, instr->repeat,
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
+                               IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
+               cat4->rel.src_c    = !!(src->flags & IR3_REG_CONST);
+               cat4->rel.src_rel  = 1;
+       } else if (src->flags & IR3_REG_CONST) {
+               iassert(src->num < (1 << 12));
+               cat4->c.src   = reg(src, info, instr->repeat,
+                               IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS |
+                               IR3_REG_R | IR3_REG_HALF);
+               cat4->c.src_c = 1;
+       } else {
+               iassert(src->num < (1 << 11));
+               cat4->src = reg(src, info, instr->repeat,
+                               IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
+                               IR3_REG_R | IR3_REG_HALF);
+       }
+
+       cat4->src_im   = !!(src->flags & IR3_REG_IMMED);
+       cat4->src_neg  = !!(src->flags & IR3_REG_FNEG);
+       cat4->src_abs  = !!(src->flags & IR3_REG_FABS);
+       cat4->src_r    = !!(src->flags & IR3_REG_R);
+
+       cat4->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+       cat4->repeat   = instr->repeat;
+       cat4->sat      = !!(instr->flags & IR3_INSTR_SAT);
+       cat4->ss       = !!(instr->flags & IR3_INSTR_SS);
+       cat4->ul       = !!(instr->flags & IR3_INSTR_UL);
+       cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF);
+       cat4->full     = ! (src->flags & IR3_REG_HALF);
+       cat4->opc      = instr->opc;
+       cat4->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat4->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat4->opc_cat  = 4;
+
+       return 0;
+}
+
+static int emit_cat5(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst = instr->regs[0];
+       struct ir3_register *src1 = instr->regs[1];
+       struct ir3_register *src2 = instr->regs[2];
+       struct ir3_register *src3 = instr->regs[3];
+       instr_cat5_t *cat5 = ptr;
+
+       iassert_type(dst, type_size(instr->cat5.type) == 32);
+
+       assume(src1 || !src2);
+       assume(src2 || !src3);
+
+       if (src1) {
+               cat5->full = ! (src1->flags & IR3_REG_HALF);
+               cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
+       }
+
+       if (instr->flags & IR3_INSTR_S2EN) {
+               if (src2) {
+                       iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+                       cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+               }
+               if (src3) {
+                       iassert(src3->flags & IR3_REG_HALF);
+                       cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF);
+               }
+               iassert(!(instr->cat5.samp | instr->cat5.tex));
+       } else {
+               iassert(!src3);
+               if (src2) {
+                       iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
+                       cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
+               }
+               cat5->norm.samp = instr->cat5.samp;
+               cat5->norm.tex  = instr->cat5.tex;
+       }
+
+       cat5->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+       cat5->wrmask   = dst->wrmask;
+       cat5->type     = instr->cat5.type;
+       cat5->is_3d    = !!(instr->flags & IR3_INSTR_3D);
+       cat5->is_a     = !!(instr->flags & IR3_INSTR_A);
+       cat5->is_s     = !!(instr->flags & IR3_INSTR_S);
+       cat5->is_s2en  = !!(instr->flags & IR3_INSTR_S2EN);
+       cat5->is_o     = !!(instr->flags & IR3_INSTR_O);
+       cat5->is_p     = !!(instr->flags & IR3_INSTR_P);
+       cat5->opc      = instr->opc;
+       cat5->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat5->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat5->opc_cat  = 5;
+
+       return 0;
+}
+
+static int emit_cat6(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       struct ir3_register *dst, *src1, *src2;
+       instr_cat6_t *cat6 = ptr;
+       bool type_full = type_size(instr->cat6.type) == 32;
+
+       cat6->type     = instr->cat6.type;
+       cat6->opc      = instr->opc;
+       cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat6->g        = !!(instr->flags & IR3_INSTR_G);
+       cat6->opc_cat  = 6;
+
+       switch (instr->opc) {
+       case OPC_RESINFO:
+       case OPC_RESFMT:
+               iassert_type(instr->regs[0], type_full); /* dst */
+               iassert_type(instr->regs[1], type_full); /* src1 */
+               break;
+       case OPC_L2G:
+       case OPC_G2L:
+               iassert_type(instr->regs[0], true);      /* dst */
+               iassert_type(instr->regs[1], true);      /* src1 */
+               break;
+       case OPC_STG:
+       case OPC_STL:
+       case OPC_STP:
+       case OPC_STI:
+       case OPC_STLW:
+       case OPC_STIB:
+               /* no dst, so regs[0] is dummy */
+               iassert_type(instr->regs[1], true);      /* dst */
+               iassert_type(instr->regs[2], type_full); /* src1 */
+               iassert_type(instr->regs[3], true);      /* src2 */
+               break;
+       default:
+               iassert_type(instr->regs[0], type_full); /* dst */
+               iassert_type(instr->regs[1], true);      /* src1 */
+               if (instr->regs_count > 2)
+                       iassert_type(instr->regs[2], true);  /* src2 */
+               break;
+       }
+
+       /* the "dst" for a store instruction is (from the perspective
+        * of data flow in the shader, ie. register use/def, etc) in
+        * fact a register that is read by the instruction, rather
+        * than written:
+        */
+       if (is_store(instr)) {
+               iassert(instr->regs_count >= 3);
+
+               dst  = instr->regs[1];
+               src1 = instr->regs[2];
+               src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL;
+       } else {
+               iassert(instr->regs_count >= 2);
+
+               dst  = instr->regs[0];
+               src1 = instr->regs[1];
+               src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
+       }
+
+       /* TODO we need a more comprehensive list about which instructions
+        * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
+        * indicate to use the src_off encoding even if offset is zero
+        * (but then what to do about dst_off?)
+        */
+       if (is_atomic(instr->opc)) {
+               instr_cat6ldgb_t *ldgb = ptr;
+
+               /* maybe these two bits both determine the instruction encoding? */
+               cat6->src_off = false;
+
+               ldgb->d = instr->cat6.d - 1;
+               ldgb->typed = instr->cat6.typed;
+               ldgb->type_size = instr->cat6.iim_val - 1;
+
+               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+               if (ldgb->g) {
+                       struct ir3_register *src3 = instr->regs[3];
+                       struct ir3_register *src4 = instr->regs[4];
+
+                       /* first src is src_ssbo: */
+                       iassert(src1->flags & IR3_REG_IMMED);
+                       ldgb->src_ssbo = src1->uim_val;
+
+                       ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+                       ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+                       ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+                       ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+                       ldgb->src3 = reg(src4, info, instr->repeat, 0);
+                       ldgb->pad0 = 0x1;
+                       ldgb->pad3 = 0x1;
+               } else {
+                       ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
+                       ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED);
+                       ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+                       ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+                       ldgb->pad0 = 0x1;
+                       ldgb->pad3 = 0x0;
+               }
+
+               return 0;
+       } else if (instr->opc == OPC_LDGB) {
+               struct ir3_register *src3 = instr->regs[3];
+               instr_cat6ldgb_t *ldgb = ptr;
+
+               /* maybe these two bits both determine the instruction encoding? */
+               cat6->src_off = false;
+
+               ldgb->d = instr->cat6.d - 1;
+               ldgb->typed = instr->cat6.typed;
+               ldgb->type_size = instr->cat6.iim_val - 1;
+
+               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+               /* first src is src_ssbo: */
+               iassert(src1->flags & IR3_REG_IMMED);
+               ldgb->src_ssbo = src1->uim_val;
+
+               /* then next two are src1/src2: */
+               ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+               ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+               ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+               ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+               ldgb->pad0 = 0x0;
+               ldgb->pad3 = 0x1;
+
+               return 0;
+       } else if (instr->opc == OPC_RESINFO) {
+               instr_cat6ldgb_t *ldgb = ptr;
+
+               ldgb->d = instr->cat6.d - 1;
+
+               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+               /* first src is src_ssbo: */
+               iassert(src1->flags & IR3_REG_IMMED);
+               ldgb->src_ssbo = src1->uim_val;
+
+               return 0;
+       } else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) {
+               struct ir3_register *src3 = instr->regs[4];
+               instr_cat6stgb_t *stgb = ptr;
+
+               /* maybe these two bits both determine the instruction encoding? */
+               cat6->src_off = true;
+               stgb->pad3 = 0x2;
+
+               stgb->d = instr->cat6.d - 1;
+               stgb->typed = instr->cat6.typed;
+               stgb->type_size = instr->cat6.iim_val - 1;
+
+               /* first src is dst_ssbo: */
+               iassert(dst->flags & IR3_REG_IMMED);
+               stgb->dst_ssbo = dst->uim_val;
+
+               /* then src1/src2/src3: */
+               stgb->src1 = reg(src1, info, instr->repeat, 0);
+               stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+               stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+               stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+               stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
+
+               return 0;
+       } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) ||
+                       (instr->opc == OPC_LDL)) {
+               instr_cat6a_t *cat6a = ptr;
+
+               cat6->src_off = true;
+
+               cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
+               cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED);
+               if (src2) {
+                       cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+                       cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED);
+               }
+               cat6a->off = instr->cat6.src_offset;
+       } else {
+               instr_cat6b_t *cat6b = ptr;
+
+               cat6->src_off = false;
+
+               cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED | IR3_REG_HALF);
+               cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED);
+               if (src2) {
+                       cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+                       cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED);
+               }
+       }
+
+       if (instr->cat6.dst_offset || (instr->opc == OPC_STG) ||
+                       (instr->opc == OPC_STL)) {
+               instr_cat6c_t *cat6c = ptr;
+               cat6->dst_off = true;
+               cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+               cat6c->off = instr->cat6.dst_offset;
+       } else {
+               instr_cat6d_t *cat6d = ptr;
+               cat6->dst_off = false;
+               cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+       }
+
+       return 0;
+}
+
+static int emit_cat7(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info)
+{
+       instr_cat7_t *cat7 = ptr;
+
+       cat7->ss      = !!(instr->flags & IR3_INSTR_SS);
+       cat7->w       = instr->cat7.w;
+       cat7->r       = instr->cat7.r;
+       cat7->l       = instr->cat7.l;
+       cat7->g       = instr->cat7.g;
+       cat7->opc     = instr->opc;
+       cat7->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+       cat7->sync    = !!(instr->flags & IR3_INSTR_SY);
+       cat7->opc_cat = 7;
+
+       return 0;
+}
+
+static int (*emit[])(struct ir3_instruction *instr, void *ptr,
+               struct ir3_info *info) = {
+       emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6,
+       emit_cat7,
+};
+
+void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
+               uint32_t gpu_id)
+{
+       uint32_t *ptr, *dwords;
+
+       info->gpu_id        = gpu_id;
+       info->max_reg       = -1;
+       info->max_half_reg  = -1;
+       info->max_const     = -1;
+       info->instrs_count  = 0;
+       info->sizedwords    = 0;
+       info->ss = info->sy = 0;
+
+       list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       info->sizedwords += 2;
+               }
+       }
+
+       /* need an integer number of instruction "groups" (sets of 16
+        * instructions on a4xx or sets of 4 instructions on a3xx),
+        * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits)
+        */
+       if (gpu_id >= 400) {
+               info->sizedwords = align(info->sizedwords, 16 * 2);
+       } else {
+               info->sizedwords = align(info->sizedwords, 4 * 2);
+       }
+
+       ptr = dwords = calloc(4, info->sizedwords);
+
+       list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       int ret = emit[opc_cat(instr->opc)](instr, dwords, info);
+                       if (ret)
+                               goto fail;
+                       info->instrs_count += 1 + instr->repeat;
+                       dwords += 2;
+
+                       if (instr->flags & IR3_INSTR_SS)
+                               info->ss++;
+
+                       if (instr->flags & IR3_INSTR_SY)
+                               info->sy++;
+               }
+       }
+
+       return ptr;
+
+fail:
+       free(ptr);
+       return NULL;
+}
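+
+/* Usage sketch (upload_shader() is hypothetical): the returned buffer
+ * is heap-allocated, so the caller owns and frees it:
+ *
+ *   struct ir3_info info;
+ *   uint32_t *bin = ir3_assemble(ir, &info, gpu_id);
+ *   if (bin) {
+ *           upload_shader(bin, info.sizedwords * 4);
+ *           free(bin);
+ *   }
+ */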
+
+static struct ir3_register * reg_create(struct ir3 *shader,
+               int num, int flags)
+{
+       struct ir3_register *reg =
+                       ir3_alloc(shader, sizeof(struct ir3_register));
+       reg->wrmask = 1;
+       reg->flags = flags;
+       reg->num = num;
+       return reg;
+}
+
+static void insert_instr(struct ir3_block *block,
+               struct ir3_instruction *instr)
+{
+       struct ir3 *shader = block->shader;
+#ifdef DEBUG
+       instr->serialno = ++shader->instr_count;
+#endif
+       list_addtail(&instr->node, &block->instr_list);
+
+       if (is_input(instr))
+               array_insert(shader, shader->baryfs, instr);
+}
+
+struct ir3_block * ir3_block_create(struct ir3 *shader)
+{
+       struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
+#ifdef DEBUG
+       block->serialno = ++shader->block_count;
+#endif
+       block->shader = shader;
+       list_inithead(&block->node);
+       list_inithead(&block->instr_list);
+       return block;
+}
+
+static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg)
+{
+       struct ir3_instruction *instr;
+       unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0]));
+       char *ptr = ir3_alloc(block->shader, sz);
+
+       instr = (struct ir3_instruction *)ptr;
+       ptr  += sizeof(*instr);
+       instr->regs = (struct ir3_register **)ptr;
+
+#ifdef DEBUG
+       instr->regs_max = nreg;
+#endif
+
+       return instr;
+}
+
+struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
+               opc_t opc, int nreg)
+{
+       struct ir3_instruction *instr = instr_create(block, nreg);
+       instr->block = block;
+       instr->opc = opc;
+       insert_instr(block, instr);
+       return instr;
+}
+
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc)
+{
+       /* NOTE: we could be slightly more clever, at least for non-meta,
+        * and choose # of regs based on category.
+        */
+       return ir3_instr_create2(block, opc, 4);
+}
+
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
+{
+       struct ir3_instruction *new_instr = instr_create(instr->block,
+                       instr->regs_count);
+       struct ir3_register **regs;
+       unsigned i;
+
+       regs = new_instr->regs;
+       *new_instr = *instr;
+       new_instr->regs = regs;
+
+       insert_instr(instr->block, new_instr);
+
+       /* clone registers: */
+       new_instr->regs_count = 0;
+       for (i = 0; i < instr->regs_count; i++) {
+               struct ir3_register *reg = instr->regs[i];
+               struct ir3_register *new_reg =
+                               ir3_reg_create(new_instr, reg->num, reg->flags);
+               *new_reg = *reg;
+       }
+
+       return new_instr;
+}
+
+/* Add a false dependency to an instruction, to ensure that the
+ * dependency is scheduled before the instruction itself:
+ */
+void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
+{
+       array_insert(instr, instr->deps, dep);
+}
+
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+               int num, int flags)
+{
+       struct ir3 *shader = instr->block->shader;
+       struct ir3_register *reg = reg_create(shader, num, flags);
+#ifdef DEBUG
+       debug_assert(instr->regs_count < instr->regs_max);
+#endif
+       instr->regs[instr->regs_count++] = reg;
+       return reg;
+}
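+
+/* Construction sketch (register numbers illustrative): dst is always
+ * regs[0], followed by the srcs, which is the layout the emit_catN()
+ * functions above expect:
+ *
+ *   struct ir3_instruction *add = ir3_instr_create(block, OPC_ADD_F);
+ *   ir3_reg_create(add, (2 << 2) | 0, 0);   // dst:  r2.x
+ *   ir3_reg_create(add, (0 << 2) | 0, 0);   // src1: r0.x
+ *   ir3_reg_create(add, (0 << 2) | 1, 0);   // src2: r0.y
+ */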
+
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+               struct ir3_register *reg)
+{
+       struct ir3_register *new_reg = reg_create(shader, 0, 0);
+       *new_reg = *reg;
+       return new_reg;
+}
+
+void
+ir3_instr_set_address(struct ir3_instruction *instr,
+               struct ir3_instruction *addr)
+{
+       if (instr->address != addr) {
+               struct ir3 *ir = instr->block->shader;
+               instr->address = addr;
+               array_insert(ir, ir->indirects, instr);
+       }
+}
+
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+               instr->flags &= ~IR3_INSTR_MARK;
+}
+
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               ir3_block_clear_mark(block);
+       }
+}
+
+/* note: this will destroy instr->depth, don't do it until after sched! */
+unsigned
+ir3_count_instructions(struct ir3 *ir)
+{
+       unsigned cnt = 0;
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       instr->ip = cnt++;
+               }
+               block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+               block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+       }
+       return cnt;
+}
+
+struct ir3_array *
+ir3_lookup_array(struct ir3 *ir, unsigned id)
+{
+       list_for_each_entry (struct ir3_array, arr, &ir->array_list, node)
+               if (arr->id == id)
+                       return arr;
+       return NULL;
+}
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
new file mode 100644 (file)
index 0000000..ea32188
--- /dev/null
@@ -0,0 +1,1394 @@
+/*
+ * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IR3_H_
+#define IR3_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "compiler/shader_enums.h"
+
+#include "util/u_debug.h"
+#include "util/list.h"
+
+#include "instr-a3xx.h"
+
+/* low level intermediate representation of an adreno shader program */
+
+struct ir3_compiler;
+struct ir3;
+struct ir3_instruction;
+struct ir3_block;
+
+struct ir3_info {
+       uint32_t gpu_id;
+       uint16_t sizedwords;
+       uint16_t instrs_count;   /* expanded to account for rpt's */
+       /* NOTE: max_reg, etc, does not include registers not touched
+        * by the shader (ie. vertex fetched via VFD_DECODE but not
+        * touched by shader)
+        */
+       int8_t   max_reg;   /* highest GPR # used by shader */
+       int8_t   max_half_reg;
+       int16_t  max_const;
+
+       /* number of sync bits: */
+       uint16_t ss, sy;
+};
+
+struct ir3_register {
+       enum {
+               IR3_REG_CONST  = 0x001,
+               IR3_REG_IMMED  = 0x002,
+               IR3_REG_HALF   = 0x004,
+               /* high registers are used for some things in compute shaders,
+                * for example.  They seem to hold values that are the same
+                * for every thread in a wave, so possibly they are shared
+                * across the wave?
+                */
+               IR3_REG_HIGH   = 0x008,
+               IR3_REG_RELATIV= 0x010,
+               IR3_REG_R      = 0x020,
+               /* Most instructions, it seems, can do float abs/neg but not
+                * integer.  The CP pass needs to know what is intended (int or
+                * float) in order to do the right thing.  For this reason the
+                * abs/neg flags are split out into float and int variants.  In
+                * addition, for .b (bitwise) operations the negate is actually
+                * a bitwise not, so that is split out into a separate flag to
+                * make it more clear.
+                */
+               IR3_REG_FNEG   = 0x040,
+               IR3_REG_FABS   = 0x080,
+               IR3_REG_SNEG   = 0x100,
+               IR3_REG_SABS   = 0x200,
+               IR3_REG_BNOT   = 0x400,
+               IR3_REG_EVEN   = 0x800,
+               IR3_REG_POS_INF= 0x1000,
+               /* (ei) flag, end-input?  Set on last bary, presumably to signal
+                * that the shader needs no more input:
+                */
+               IR3_REG_EI     = 0x2000,
+               /* meta-flags, for intermediate stages of IR, ie.
+                * before register assignment is done:
+                */
+               IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
+               IR3_REG_ARRAY  = 0x8000,
+
+       } flags;
+
+       /* normal registers:
+        * the component is in the low two bits of the reg #, so
+        * rN.x becomes: (N << 2) | x
+        */
+       int   num;
+       union {
+               /* immediate: */
+               int32_t  iim_val;
+               uint32_t uim_val;
+               float    fim_val;
+               /* relative: */
+               struct {
+                       uint16_t id;
+                       int16_t offset;
+               } array;
+       };
+
+       /* For IR3_REG_SSA, src registers contain ptr back to assigning
+        * instruction.
+        *
+        * For IR3_REG_ARRAY, the pointer is back to the last dependent
+        * array access (although the net effect is the same, it points
+        * back to a previous instruction that we depend on).
+        */
+       struct ir3_instruction *instr;
+
+       union {
+               /* used for cat5 instructions, but also for internal/IR level
+                * tracking of what registers are read/written by an instruction.
+                * wrmask may be a bad name since it is used to represent both
+                * src and dst that touch multiple adjacent registers.
+                */
+               unsigned wrmask;
+               /* for relative addressing, a 32 bit wrmask is too small to
+                * cover an array, but otoh we don't need to deal with disjoint
+                * sets, so instead use a simple size field (number of scalar
+                * components).
+                */
+               unsigned size;
+       };
+};
+
+/*
+ * Stupid/simple growable array implementation:
+ */
+#define DECLARE_ARRAY(type, name) \
+       unsigned name ## _count, name ## _sz; \
+       type * name;
+
+#define array_insert(ctx, arr, val) do { \
+               if (arr ## _count == arr ## _sz) { \
+                       arr ## _sz = MAX2(2 * arr ## _sz, 16); \
+                       arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
+               } \
+               arr[arr ##_count++] = val; \
+       } while (0)
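+
+/* An illustrative sketch (not part of the IR itself) of how the two
+ * macros above combine; the struct and field names are made up, and
+ * 'ctx' serves as both the ralloc context and the owning struct:
+ *
+ *   struct foo_pass_ctx {
+ *       DECLARE_ARRAY(struct ir3_instruction *, kills);
+ *   };
+ *
+ *   // array_insert() grows the backing storage with reralloc_size():
+ *   array_insert(ctx, ctx->kills, instr);
+ *   for (unsigned i = 0; i < ctx->kills_count; i++)
+ *       consume(ctx->kills[i]);
+ */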
+
+struct ir3_instruction {
+       struct ir3_block *block;
+       opc_t opc;
+       enum {
+               /* (sy) flag is set on first instruction, and after sample
+                * instructions (probably just on RAW hazard).
+                */
+               IR3_INSTR_SY    = 0x001,
+               /* (ss) flag is set on first instruction, and first instruction
+                * to depend on the result of "long" instructions (RAW hazard):
+                *
+                *   rcp, rsq, log2, exp2, sin, cos, sqrt
+                *
+                * It seems to synchronize until all in-flight instructions are
+                * completed, for example:
+                *
+                *   rsq hr1.w, hr1.w
+                *   add.f hr2.z, (neg)hr2.z, hc0.y
+                *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
+                *   rsq hr2.x, hr2.x
+                *   (rpt1)nop
+                *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
+                *   nop
+                *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
+                *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
+                *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
+                *
+                * The last mul.f does not have (ss) set, presumably because the
+                * (ss) on the previous instruction does the job.
+                *
+                * The blob driver also seems to set it on WAR hazards, although
+                * it is not really clear if this is needed or just the blob
+                * compiler being sloppy.  So far I haven't found a case where
+                * removing the (ss) causes problems for a WAR hazard, but I
+                * could just be getting lucky:
+                *
+                *   rcp r1.y, r3.y
+                *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
+                *
+                */
+               IR3_INSTR_SS    = 0x002,
+               /* (jp) flag is set on jump targets:
+                */
+               IR3_INSTR_JP    = 0x004,
+               IR3_INSTR_UL    = 0x008,
+               IR3_INSTR_3D    = 0x010,
+               IR3_INSTR_A     = 0x020,
+               IR3_INSTR_O     = 0x040,
+               IR3_INSTR_P     = 0x080,
+               IR3_INSTR_S     = 0x100,
+               IR3_INSTR_S2EN  = 0x200,
+               IR3_INSTR_G     = 0x400,
+               IR3_INSTR_SAT   = 0x800,
+               /* meta-flags, for intermediate stages of IR, ie.
+                * before register assignment is done:
+                */
+               IR3_INSTR_MARK  = 0x1000,
+               IR3_INSTR_UNUSED= 0x2000,
+       } flags;
+       int repeat;
+#ifdef DEBUG
+       unsigned regs_max;
+#endif
+       unsigned regs_count;
+       struct ir3_register **regs;
+       union {
+               struct {
+                       char inv;
+                       char comp;
+                       int  immed;
+                       struct ir3_block *target;
+               } cat0;
+               struct {
+                       type_t src_type, dst_type;
+               } cat1;
+               struct {
+                       enum {
+                               IR3_COND_LT = 0,
+                               IR3_COND_LE = 1,
+                               IR3_COND_GT = 2,
+                               IR3_COND_GE = 3,
+                               IR3_COND_EQ = 4,
+                               IR3_COND_NE = 5,
+                       } condition;
+               } cat2;
+               struct {
+                       unsigned samp, tex;
+                       type_t type;
+               } cat5;
+               struct {
+                       type_t type;
+                       int src_offset;
+                       int dst_offset;
+                       int iim_val : 3;      /* for ldgb/stgb, # of components */
+                       int d : 3;
+                       bool typed : 1;
+               } cat6;
+               struct {
+                       unsigned w : 1;       /* write */
+                       unsigned r : 1;       /* read */
+                       unsigned l : 1;       /* local */
+                       unsigned g : 1;       /* global */
+               } cat7;
+               /* for meta-instructions, just used to hold extra data
+                * before instruction scheduling, etc
+                */
+               struct {
+                       int off;              /* component/offset */
+               } fo;
+               struct {
+                       struct ir3_block *block;
+               } inout;
+       };
+
+       /* transient values used during various algorithms: */
+       union {
+               /* The instruction depth is the max dependency distance to output.
+                *
+                * You can also think of it as the "cost", if we did any sort of
+                * optimization for register footprint.  Ie. a value that is
+                * just the result of moving a const to a reg would have a low
+                * cost, so it could make sense to duplicate the instruction at
+                * various points where the result is needed to reduce register
+                * footprint.
+                */
+               unsigned depth;
+               /* When we get to the RA stage, we no longer need depth, but
+                * we do need instruction's position/name:
+                */
+               struct {
+                       uint16_t ip;
+                       uint16_t name;
+               };
+       };
+
+       /* used for per-pass extra instruction data.
+        */
+       void *data;
+
+       /* Used during CP and RA stages.  For fanin and shader inputs/
+        * outputs where we need a sequence of consecutive registers,
+        * keep track of each src instruction's left (ie. 'n-1') and
+        * right (ie. 'n+1') neighbor.  The front-end must insert enough mov's
+        * to ensure that each instruction has at most one left and at
+        * most one right neighbor.  During the copy-propagation pass,
+        * we only remove mov's when we can preserve this constraint.
+        * And during the RA stage, we use the neighbor information to
+        * allocate a block of registers in one shot.
+        *
+        * TODO: maybe just add something like:
+        *   struct ir3_instruction_ref {
+        *       struct ir3_instruction *instr;
+        *       unsigned cnt;
+        *   }
+        *
+        * Or can we get away without the refcnt stuff?  It seems like
+        * it should be overkill..  the problem arises if, potentially
+        * after already eliminating some mov's, you have a single mov
+        * that needs to be grouped with its neighbors in two different
+        * places (ex. shader output and a fanin).
+        */
+       struct {
+               struct ir3_instruction *left, *right;
+               uint16_t left_cnt, right_cnt;
+       } cp;
+
+       /* an instruction can reference at most one address register amongst
+        * its src/dst registers.  Beyond that, you need to insert mov's.
+        *
+        * NOTE: do not write this directly, use ir3_instr_set_address()
+        */
+       struct ir3_instruction *address;
+
+       /* Tracking for additional dependent instructions.  Used to handle
+        * barriers, WAR hazards for arrays/SSBOs/etc.
+        */
+       DECLARE_ARRAY(struct ir3_instruction *, deps);
+
+       /*
+        * From PoV of instruction scheduling, not execution (ie. ignores global/
+        * local distinction):
+        *                            shared  image  atomic  SSBO  everything
+        *   barrier()/            -   R/W     R/W    R/W     R/W       X
+        *     groupMemoryBarrier()
+        *   memoryBarrier()       -           R/W    R/W
+        *     (but only images declared coherent?)
+        *   memoryBarrierAtomic() -                  R/W
+        *   memoryBarrierBuffer() -                          R/W
+        *   memoryBarrierImage()  -           R/W
+        *   memoryBarrierShared() -   R/W
+        *
+        * TODO I think for SSBO/image/shared, in cases where we can determine
+        * which variable is accessed, we don't need to care about accesses to
+        * different variables (unless declared coherent??)
+        */
+       enum {
+               IR3_BARRIER_EVERYTHING = 1 << 0,
+               IR3_BARRIER_SHARED_R   = 1 << 1,
+               IR3_BARRIER_SHARED_W   = 1 << 2,
+               IR3_BARRIER_IMAGE_R    = 1 << 3,
+               IR3_BARRIER_IMAGE_W    = 1 << 4,
+               IR3_BARRIER_BUFFER_R   = 1 << 5,
+               IR3_BARRIER_BUFFER_W   = 1 << 6,
+               IR3_BARRIER_ARRAY_R    = 1 << 7,
+               IR3_BARRIER_ARRAY_W    = 1 << 8,
+       } barrier_class, barrier_conflict;
+
+       /* Entry in ir3_block's instruction list: */
+       struct list_head node;
+
+       int use_count;      /* currently just updated/used by cp */
+
+#ifdef DEBUG
+       uint32_t serialno;
+#endif
+};
+
+static inline struct ir3_instruction *
+ir3_neighbor_first(struct ir3_instruction *instr)
+{
+       int cnt = 0;
+       while (instr->cp.left) {
+               instr = instr->cp.left;
+               if (++cnt > 0xffff) {
+                       debug_assert(0);
+                       break;
+               }
+       }
+       return instr;
+}
+
+static inline int ir3_neighbor_count(struct ir3_instruction *instr)
+{
+       int num = 1;
+
+       debug_assert(!instr->cp.left);
+
+       while (instr->cp.right) {
+               num++;
+               instr = instr->cp.right;
+               if (num > 0xffff) {
+                       debug_assert(0);
+                       break;
+               }
+       }
+
+       return num;
+}
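+
+/* A small usage sketch (not from a specific pass): walking a whole
+ * neighbor group, e.g. to size the block of consecutive registers it
+ * will need.  'visit' is a hypothetical callback:
+ *
+ *   struct ir3_instruction *first = ir3_neighbor_first(instr);
+ *   int nregs = ir3_neighbor_count(first);   // size of register block
+ *   for (struct ir3_instruction *cur = first; cur; cur = cur->cp.right)
+ *       visit(cur, nregs);
+ */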
+
+struct ir3 {
+       struct ir3_compiler *compiler;
+
+       unsigned ninputs, noutputs;
+       struct ir3_instruction **inputs;
+       struct ir3_instruction **outputs;
+
+       /* Track bary.f (and ldlv) instructions.. this is needed in
+        * scheduling to ensure that all varying fetches happen before
+        * any potential kill instructions.  The hw gets grumpy if all
+        * threads in a group are killed before the last bary.f gets
+        * a chance to signal end of input (ei).
+        */
+       DECLARE_ARRAY(struct ir3_instruction *, baryfs);
+
+       /* Track all indirect instructions (read and write).  To avoid
+        * deadlock scenario where an address register gets scheduled,
+        * but other dependent src instructions cannot be scheduled due
+        * to dependency on a *different* address register value, the
+        * scheduler needs to ensure that all instructions that depend on
+        * the current address register value are scheduled before the
+        * one that writes the next address register value.  Having a
+        * convenient list of instructions that reference some address
+        * register simplifies this.
+        */
+       DECLARE_ARRAY(struct ir3_instruction *, indirects);
+
+       /* and same for instructions that consume predicate register: */
+       DECLARE_ARRAY(struct ir3_instruction *, predicates);
+
+       /* Track texture sample instructions which need texture state
+        * patched in (for astc-srgb workaround):
+        */
+       DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
+
+       /* List of blocks: */
+       struct list_head block_list;
+
+       /* List of ir3_array's: */
+       struct list_head array_list;
+
+#ifdef DEBUG
+       unsigned block_count, instr_count;
+#endif
+};
+
+struct ir3_array {
+       struct list_head node;
+       unsigned length;
+       unsigned id;
+
+       struct nir_register *r;
+
+       /* To avoid array writes getting DCE'd, keep track of the
+        * most recent write.  Any array access depends on the most
+        * recent write.  This way, nothing depends on writes after the
+        * last read.  But all the writes that happen before that have
+        * something depending on them.
+        */
+       struct ir3_instruction *last_write;
+
+       /* extra stuff used in RA pass: */
+       unsigned base;      /* base vreg name */
+       unsigned reg;       /* base physical reg */
+       uint16_t start_ip, end_ip;
+};
+
+struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
+
+struct ir3_block {
+       struct list_head node;
+       struct ir3 *shader;
+
+       const struct nir_block *nblock;
+
+       struct list_head instr_list;  /* list of ir3_instruction */
+
+       /* each block has either one or two successors.. in case of
+        * two successors, 'condition' decides which one to follow.
+        * A block preceding an if/else has two successors.
+        */
+       struct ir3_instruction *condition;
+       struct ir3_block *successors[2];
+
+       unsigned predecessors_count;
+       struct ir3_block **predecessors;
+
+       uint16_t start_ip, end_ip;
+
+       /* Track instructions which do not write a register but
+        * otherwise must not be discarded (such as kill, stg, etc)
+        */
+       DECLARE_ARRAY(struct ir3_instruction *, keeps);
+
+       /* used for per-pass extra block data.  Mainly used right
+        * now in RA step to track livein/liveout.
+        */
+       void *data;
+
+#ifdef DEBUG
+       uint32_t serialno;
+#endif
+};
+
+static inline uint32_t
+block_id(struct ir3_block *block)
+{
+#ifdef DEBUG
+       return block->serialno;
+#else
+       return (uint32_t)(unsigned long)block;
+#endif
+}
+
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+               unsigned nin, unsigned nout);
+void ir3_destroy(struct ir3 *shader);
+void * ir3_assemble(struct ir3 *shader,
+               struct ir3_info *info, uint32_t gpu_id);
+void * ir3_alloc(struct ir3 *shader, int sz);
+
+struct ir3_block * ir3_block_create(struct ir3 *shader);
+
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
+struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
+               opc_t opc, int nreg);
+struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
+void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
+const char *ir3_instr_name(struct ir3_instruction *instr);
+
+struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
+               int num, int flags);
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+               struct ir3_register *reg);
+
+void ir3_instr_set_address(struct ir3_instruction *instr,
+               struct ir3_instruction *addr);
+
+static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
+{
+       if (instr->flags & IR3_INSTR_MARK)
+               return true;  /* already visited */
+       instr->flags |= IR3_INSTR_MARK;
+       return false;
+}
+
+void ir3_block_clear_mark(struct ir3_block *block);
+void ir3_clear_mark(struct ir3 *shader);
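+
+/* Typical marking pattern (a sketch, not tied to any one pass):
+ *
+ *   ir3_clear_mark(ir);
+ *   // ... then in the recursive walk:
+ *   if (ir3_instr_check_mark(instr))
+ *       return;   // already visited
+ */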
+
+unsigned ir3_count_instructions(struct ir3 *ir);
+
+static inline int ir3_instr_regno(struct ir3_instruction *instr,
+               struct ir3_register *reg)
+{
+       unsigned i;
+       for (i = 0; i < instr->regs_count; i++)
+               if (reg == instr->regs[i])
+                       return i;
+       return -1;
+}
+
+
+#define MAX_ARRAYS 16
+
+/* comp:
+ *   0 - x
+ *   1 - y
+ *   2 - z
+ *   3 - w
+ */
+static inline uint32_t regid(int num, int comp)
+{
+       return (num << 2) | (comp & 0x3);
+}
+
+static inline uint32_t reg_num(struct ir3_register *reg)
+{
+       return reg->num >> 2;
+}
+
+static inline uint32_t reg_comp(struct ir3_register *reg)
+{
+       return reg->num & 0x3;
+}
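+
+/* For example, r2.z encodes as regid(2, 2) == 10, and an ir3_register
+ * whose 'num' is 10 decodes back via reg_num()/reg_comp() to num=2,
+ * comp=2.  (Illustration only.)
+ */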
+
+static inline bool is_flow(struct ir3_instruction *instr)
+{
+       return (opc_cat(instr->opc) == 0);
+}
+
+static inline bool is_kill(struct ir3_instruction *instr)
+{
+       return instr->opc == OPC_KILL;
+}
+
+static inline bool is_nop(struct ir3_instruction *instr)
+{
+       return instr->opc == OPC_NOP;
+}
+
+/* Is it a non-transformative (ie. not type changing) mov?  This can
+ * also include absneg.s/absneg.f, which for the most part can be
+ * treated as a mov (single src argument).
+ */
+static inline bool is_same_type_mov(struct ir3_instruction *instr)
+{
+       struct ir3_register *dst;
+
+       switch (instr->opc) {
+       case OPC_MOV:
+               if (instr->cat1.src_type != instr->cat1.dst_type)
+                       return false;
+               break;
+       case OPC_ABSNEG_F:
+       case OPC_ABSNEG_S:
+               if (instr->flags & IR3_INSTR_SAT)
+                       return false;
+               break;
+       default:
+               return false;
+       }
+
+       dst = instr->regs[0];
+
+       /* mov's that write to a0.x or p0.x are special: */
+       if (dst->num == regid(REG_P0, 0))
+               return false;
+       if (dst->num == regid(REG_A0, 0))
+               return false;
+
+       if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
+               return false;
+
+       return true;
+}
+
+static inline bool is_alu(struct ir3_instruction *instr)
+{
+       return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
+}
+
+static inline bool is_sfu(struct ir3_instruction *instr)
+{
+       return (opc_cat(instr->opc) == 4);
+}
+
+static inline bool is_tex(struct ir3_instruction *instr)
+{
+       return (opc_cat(instr->opc) == 5);
+}
+
+static inline bool is_mem(struct ir3_instruction *instr)
+{
+       return (opc_cat(instr->opc) == 6);
+}
+
+static inline bool is_barrier(struct ir3_instruction *instr)
+{
+       return (opc_cat(instr->opc) == 7);
+}
+
+static inline bool
+is_store(struct ir3_instruction *instr)
+{
+       /* for these instructions, the "destination" register is
+        * actually a source, the address to store to.
+        */
+       switch (instr->opc) {
+       case OPC_STG:
+       case OPC_STGB:
+       case OPC_STIB:
+       case OPC_STP:
+       case OPC_STL:
+       case OPC_STLW:
+       case OPC_L2G:
+       case OPC_G2L:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool is_load(struct ir3_instruction *instr)
+{
+       switch (instr->opc) {
+       case OPC_LDG:
+       case OPC_LDGB:
+       case OPC_LDL:
+       case OPC_LDP:
+       case OPC_L2G:
+       case OPC_LDLW:
+       case OPC_LDC:
+       case OPC_LDLV:
+               /* probably some others too.. */
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool is_input(struct ir3_instruction *instr)
+{
+       /* in some cases, ldlv is used to fetch a varying without
+        * interpolation.. fortunately inloc is the first src
+        * register in either case
+        */
+       switch (instr->opc) {
+       case OPC_LDLV:
+       case OPC_BARY_F:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool is_bool(struct ir3_instruction *instr)
+{
+       switch (instr->opc) {
+       case OPC_CMPS_F:
+       case OPC_CMPS_S:
+       case OPC_CMPS_U:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static inline bool is_meta(struct ir3_instruction *instr)
+{
+       /* TODO how should we count PHI (and maybe fan-in/out) which
+        * might actually contribute some instructions to the final
+        * result?
+        */
+       return (opc_cat(instr->opc) == -1);
+}
+
+static inline bool writes_addr(struct ir3_instruction *instr)
+{
+       if (instr->regs_count > 0) {
+               struct ir3_register *dst = instr->regs[0];
+               return reg_num(dst) == REG_A0;
+       }
+       return false;
+}
+
+static inline bool writes_pred(struct ir3_instruction *instr)
+{
+       if (instr->regs_count > 0) {
+               struct ir3_register *dst = instr->regs[0];
+               return reg_num(dst) == REG_P0;
+       }
+       return false;
+}
+
+/* returns defining instruction for reg */
+/* TODO better name */
+static inline struct ir3_instruction *ssa(struct ir3_register *reg)
+{
+       if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
+               return reg->instr;
+       }
+       return NULL;
+}
+
+static inline bool conflicts(struct ir3_instruction *a,
+               struct ir3_instruction *b)
+{
+       return (a && b) && (a != b);
+}
+
+static inline bool reg_gpr(struct ir3_register *r)
+{
+       if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+               return false;
+       if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
+               return false;
+       return true;
+}
+
+static inline type_t half_type(type_t type)
+{
+       switch (type) {
+       case TYPE_F32: return TYPE_F16;
+       case TYPE_U32: return TYPE_U16;
+       case TYPE_S32: return TYPE_S16;
+       case TYPE_F16:
+       case TYPE_U16:
+       case TYPE_S16:
+               return type;
+       default:
+               assert(0);
+               return ~0;
+       }
+}
+
+/* some cat2 instructions (ie. those which are not float) can embed an
+ * immediate:
+ */
+static inline bool ir3_cat2_int(opc_t opc)
+{
+       switch (opc) {
+       case OPC_ADD_U:
+       case OPC_ADD_S:
+       case OPC_SUB_U:
+       case OPC_SUB_S:
+       case OPC_CMPS_U:
+       case OPC_CMPS_S:
+       case OPC_MIN_U:
+       case OPC_MIN_S:
+       case OPC_MAX_U:
+       case OPC_MAX_S:
+       case OPC_CMPV_U:
+       case OPC_CMPV_S:
+       case OPC_MUL_U:
+       case OPC_MUL_S:
+       case OPC_MULL_U:
+       case OPC_CLZ_S:
+       case OPC_ABSNEG_S:
+       case OPC_AND_B:
+       case OPC_OR_B:
+       case OPC_NOT_B:
+       case OPC_XOR_B:
+       case OPC_BFREV_B:
+       case OPC_CLZ_B:
+       case OPC_SHL_B:
+       case OPC_SHR_B:
+       case OPC_ASHR_B:
+       case OPC_MGEN_B:
+       case OPC_GETBIT_B:
+       case OPC_CBITS_B:
+       case OPC_BARY_F:
+               return true;
+
+       default:
+               return false;
+       }
+}
+
+
+/* map cat2 instruction to valid abs/neg flags: */
+static inline unsigned ir3_cat2_absneg(opc_t opc)
+{
+       switch (opc) {
+       case OPC_ADD_F:
+       case OPC_MIN_F:
+       case OPC_MAX_F:
+       case OPC_MUL_F:
+       case OPC_SIGN_F:
+       case OPC_CMPS_F:
+       case OPC_ABSNEG_F:
+       case OPC_CMPV_F:
+       case OPC_FLOOR_F:
+       case OPC_CEIL_F:
+       case OPC_RNDNE_F:
+       case OPC_RNDAZ_F:
+       case OPC_TRUNC_F:
+       case OPC_BARY_F:
+               return IR3_REG_FABS | IR3_REG_FNEG;
+
+       case OPC_ADD_U:
+       case OPC_ADD_S:
+       case OPC_SUB_U:
+       case OPC_SUB_S:
+       case OPC_CMPS_U:
+       case OPC_CMPS_S:
+       case OPC_MIN_U:
+       case OPC_MIN_S:
+       case OPC_MAX_U:
+       case OPC_MAX_S:
+       case OPC_CMPV_U:
+       case OPC_CMPV_S:
+       case OPC_MUL_U:
+       case OPC_MUL_S:
+       case OPC_MULL_U:
+       case OPC_CLZ_S:
+               return 0;
+
+       case OPC_ABSNEG_S:
+               return IR3_REG_SABS | IR3_REG_SNEG;
+
+       case OPC_AND_B:
+       case OPC_OR_B:
+       case OPC_NOT_B:
+       case OPC_XOR_B:
+       case OPC_BFREV_B:
+       case OPC_CLZ_B:
+       case OPC_SHL_B:
+       case OPC_SHR_B:
+       case OPC_ASHR_B:
+       case OPC_MGEN_B:
+       case OPC_GETBIT_B:
+       case OPC_CBITS_B:
+               return IR3_REG_BNOT;
+
+       default:
+               return 0;
+       }
+}
+
+/* map cat3 instructions to valid abs/neg flags: */
+static inline unsigned ir3_cat3_absneg(opc_t opc)
+{
+       switch (opc) {
+       case OPC_MAD_F16:
+       case OPC_MAD_F32:
+       case OPC_SEL_F16:
+       case OPC_SEL_F32:
+               return IR3_REG_FNEG;
+
+       case OPC_MAD_U16:
+       case OPC_MADSH_U16:
+       case OPC_MAD_S16:
+       case OPC_MADSH_M16:
+       case OPC_MAD_U24:
+       case OPC_MAD_S24:
+       case OPC_SEL_S16:
+       case OPC_SEL_S32:
+       case OPC_SAD_S16:
+       case OPC_SAD_S32:
+               /* neg *may* work on 3rd src.. */
+
+       case OPC_SEL_B16:
+       case OPC_SEL_B32:
+
+       default:
+               return 0;
+       }
+}
+
+#define MASK(n) ((1 << (n)) - 1)
+
+/* iterator for an instruction's sources (reg), also returns src #: */
+#define foreach_src_n(__srcreg, __n, __instr) \
+       if ((__instr)->regs_count) \
+               for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
+                       if ((__srcreg = (__instr)->regs[__n + 1]))
+
+/* iterator for an instruction's sources (reg): */
+#define foreach_src(__srcreg, __instr) \
+       foreach_src_n(__srcreg, __i, __instr)
+
+static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
+{
+       unsigned cnt = instr->regs_count + instr->deps_count;
+       if (instr->address)
+               cnt++;
+       return cnt;
+}
+
+static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
+{
+       if (n == (instr->regs_count + instr->deps_count))
+               return instr->address;
+       if (n >= instr->regs_count)
+               return instr->deps[n - instr->regs_count];
+       return ssa(instr->regs[n]);
+}
+
+static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
+{
+       if (n == (instr->regs_count + instr->deps_count))
+               return false;
+       if (n >= instr->regs_count)
+               return true;
+       return false;
+}
+
+#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)
+
+/* iterator for an instruction's SSA sources (instr), also returns src #: */
+#define foreach_ssa_src_n(__srcinst, __n, __instr) \
+       for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
+               if ((__srcinst = __ssa_src_n(__instr, __n)))
+
+/* iterator for an instruction's SSA sources (instr): */
+#define foreach_ssa_src(__srcinst, __instr) \
+       foreach_ssa_src_n(__srcinst, __i, __instr)
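+
+/* A minimal sketch (not used by the compiler itself) of the visitor
+ * pattern that the mark flag and these iterators support: a depth-
+ * first walk over SSA sources counting reachable instructions.
+ * Callers would run ir3_clear_mark() on the shader first; the
+ * function name is hypothetical.
+ */
+static inline unsigned
+ir3_example_count_reachable(struct ir3_instruction *instr)
+{
+       struct ir3_instruction *src;
+       unsigned cnt;
+       if (!instr || ir3_instr_check_mark(instr))
+               return 0;
+       cnt = 1;
+       foreach_ssa_src(src, instr)
+               cnt += ir3_example_count_reachable(src);
+       return cnt;
+}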
+
+
+/* dump: */
+void ir3_print(struct ir3 *ir);
+void ir3_print_instr(struct ir3_instruction *instr);
+
+/* depth calculation: */
+int ir3_delayslots(struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer, unsigned n);
+void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
+void ir3_depth(struct ir3 *ir);
+
+/* copy-propagate: */
+struct ir3_shader_variant;
+void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
+
+/* group neighbors and insert mov's to resolve conflicts: */
+void ir3_group(struct ir3 *ir);
+
+/* scheduling: */
+void ir3_sched_add_deps(struct ir3 *ir);
+int ir3_sched(struct ir3 *ir);
+
+/* register assignment: */
+struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
+int ir3_ra(struct ir3 *ir3, gl_shader_stage type,
+               bool frag_coord, bool frag_face);
+
+/* legalize: */
+void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary);
+
+/* ************************************************************************* */
+/* instruction helpers */
+
+/* creates SSA src of correct type (ie. half vs full precision) */
+static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
+               struct ir3_instruction *src, unsigned flags)
+{
+       struct ir3_register *reg;
+       if (src->regs[0]->flags & IR3_REG_HALF)
+               flags |= IR3_REG_HALF;
+       reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags);
+       reg->instr = src;
+       return reg;
+}
+
+static inline struct ir3_instruction *
+ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
+{
+       struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
+       ir3_reg_create(instr, 0, 0);   /* dst */
+       if (src->regs[0]->flags & IR3_REG_ARRAY) {
+               struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
+               src_reg->array = src->regs[0]->array;
+       } else {
+               __ssa_src(instr, src, 0);
+       }
+       debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
+       instr->cat1.src_type = type;
+       instr->cat1.dst_type = type;
+       return instr;
+}
+
+static inline struct ir3_instruction *
+ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
+               type_t src_type, type_t dst_type)
+{
+       struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
+       unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
+       unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
+
+       debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags);
+
+       ir3_reg_create(instr, 0, dst_flags);   /* dst */
+       __ssa_src(instr, src, 0);
+       instr->cat1.src_type = src_type;
+       instr->cat1.dst_type = dst_type;
+       debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
+       return instr;
+}
+
+static inline struct ir3_instruction *
+ir3_NOP(struct ir3_block *block)
+{
+       return ir3_instr_create(block, OPC_NOP);
+}
+
+#define INSTR0(name)                                                     \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block)                                      \
+{                                                                        \
+       struct ir3_instruction *instr =                                      \
+               ir3_instr_create(block, OPC_##name);                             \
+       return instr;                                                        \
+}
+
+#define INSTR1(name)                                                     \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+               struct ir3_instruction *a, unsigned aflags)                      \
+{                                                                        \
+       struct ir3_instruction *instr =                                      \
+               ir3_instr_create(block, OPC_##name);                             \
+       ir3_reg_create(instr, 0, 0);   /* dst */                             \
+       __ssa_src(instr, a, aflags);                                         \
+       return instr;                                                        \
+}
+
+#define INSTR2(name)                                                     \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+               struct ir3_instruction *a, unsigned aflags,                      \
+               struct ir3_instruction *b, unsigned bflags)                      \
+{                                                                        \
+       struct ir3_instruction *instr =                                      \
+               ir3_instr_create(block, OPC_##name);                             \
+       ir3_reg_create(instr, 0, 0);   /* dst */                             \
+       __ssa_src(instr, a, aflags);                                         \
+       __ssa_src(instr, b, bflags);                                         \
+       return instr;                                                        \
+}
+
+#define INSTR3(name)                                                     \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+               struct ir3_instruction *a, unsigned aflags,                      \
+               struct ir3_instruction *b, unsigned bflags,                      \
+               struct ir3_instruction *c, unsigned cflags)                      \
+{                                                                        \
+       struct ir3_instruction *instr =                                      \
+               ir3_instr_create(block, OPC_##name);                             \
+       ir3_reg_create(instr, 0, 0);   /* dst */                             \
+       __ssa_src(instr, a, aflags);                                         \
+       __ssa_src(instr, b, bflags);                                         \
+       __ssa_src(instr, c, cflags);                                         \
+       return instr;                                                        \
+}
+
+#define INSTR4(name)                                                     \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block,                                      \
+               struct ir3_instruction *a, unsigned aflags,                      \
+               struct ir3_instruction *b, unsigned bflags,                      \
+               struct ir3_instruction *c, unsigned cflags,                      \
+               struct ir3_instruction *d, unsigned dflags)                      \
+{                                                                        \
+       struct ir3_instruction *instr =                                      \
+               ir3_instr_create2(block, OPC_##name, 5);                         \
+       ir3_reg_create(instr, 0, 0);   /* dst */                             \
+       __ssa_src(instr, a, aflags);                                         \
+       __ssa_src(instr, b, bflags);                                         \
+       __ssa_src(instr, c, cflags);                                         \
+       __ssa_src(instr, d, dflags);                                         \
+       return instr;                                                        \
+}
+
+#define INSTR4F(f, name)                                                 \
+static inline struct ir3_instruction *                                   \
+ir3_##name##_##f(struct ir3_block *block,                                \
+               struct ir3_instruction *a, unsigned aflags,                      \
+               struct ir3_instruction *b, unsigned bflags,                      \
+               struct ir3_instruction *c, unsigned cflags,                      \
+               struct ir3_instruction *d, unsigned dflags)                      \
+{                                                                        \
+       struct ir3_instruction *instr =                                      \
+               ir3_instr_create2(block, OPC_##name, 5);                         \
+       ir3_reg_create(instr, 0, 0);   /* dst */                             \
+       __ssa_src(instr, a, aflags);                                         \
+       __ssa_src(instr, b, bflags);                                         \
+       __ssa_src(instr, c, cflags);                                         \
+       __ssa_src(instr, d, dflags);                                         \
+       instr->flags |= IR3_INSTR_##f;                                       \
+       return instr;                                                        \
+}
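+
+/* For illustration: INSTR2(ADD_F) below expands to a builder with the
+ * signature
+ *
+ *   struct ir3_instruction *ir3_ADD_F(struct ir3_block *block,
+ *           struct ir3_instruction *a, unsigned aflags,
+ *           struct ir3_instruction *b, unsigned bflags);
+ *
+ * so, assuming suitable SSA srcs, a float subtract can be sketched as:
+ *
+ *   ir3_ADD_F(block, x, 0, y, IR3_REG_FNEG);
+ */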
+
+/* cat0 instructions: */
+INSTR0(BR)
+INSTR0(JUMP)
+INSTR1(KILL)
+INSTR0(END)
+
+/* cat2 instructions, most 2 src but some 1 src: */
+INSTR2(ADD_F)
+INSTR2(MIN_F)
+INSTR2(MAX_F)
+INSTR2(MUL_F)
+INSTR1(SIGN_F)
+INSTR2(CMPS_F)
+INSTR1(ABSNEG_F)
+INSTR2(CMPV_F)
+INSTR1(FLOOR_F)
+INSTR1(CEIL_F)
+INSTR1(RNDNE_F)
+INSTR1(RNDAZ_F)
+INSTR1(TRUNC_F)
+INSTR2(ADD_U)
+INSTR2(ADD_S)
+INSTR2(SUB_U)
+INSTR2(SUB_S)
+INSTR2(CMPS_U)
+INSTR2(CMPS_S)
+INSTR2(MIN_U)
+INSTR2(MIN_S)
+INSTR2(MAX_U)
+INSTR2(MAX_S)
+INSTR1(ABSNEG_S)
+INSTR2(AND_B)
+INSTR2(OR_B)
+INSTR1(NOT_B)
+INSTR2(XOR_B)
+INSTR2(CMPV_U)
+INSTR2(CMPV_S)
+INSTR2(MUL_U)
+INSTR2(MUL_S)
+INSTR2(MULL_U)
+INSTR1(BFREV_B)
+INSTR1(CLZ_S)
+INSTR1(CLZ_B)
+INSTR2(SHL_B)
+INSTR2(SHR_B)
+INSTR2(ASHR_B)
+INSTR2(BARY_F)
+INSTR2(MGEN_B)
+INSTR2(GETBIT_B)
+INSTR1(SETRM)
+INSTR1(CBITS_B)
+INSTR2(SHB)
+INSTR2(MSAD)
+
+/* cat3 instructions: */
+INSTR3(MAD_U16)
+INSTR3(MADSH_U16)
+INSTR3(MAD_S16)
+INSTR3(MADSH_M16)
+INSTR3(MAD_U24)
+INSTR3(MAD_S24)
+INSTR3(MAD_F16)
+INSTR3(MAD_F32)
+INSTR3(SEL_B16)
+INSTR3(SEL_B32)
+INSTR3(SEL_S16)
+INSTR3(SEL_S32)
+INSTR3(SEL_F16)
+INSTR3(SEL_F32)
+INSTR3(SAD_S16)
+INSTR3(SAD_S32)
+
+/* cat4 instructions: */
+INSTR1(RCP)
+INSTR1(RSQ)
+INSTR1(LOG2)
+INSTR1(EXP2)
+INSTR1(SIN)
+INSTR1(COS)
+INSTR1(SQRT)
+
+/* cat5 instructions: */
+INSTR1(DSX)
+INSTR1(DSY)
+
+static inline struct ir3_instruction *
+ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
+               unsigned wrmask, unsigned flags, unsigned samp, unsigned tex,
+               struct ir3_instruction *src0, struct ir3_instruction *src1)
+{
+       struct ir3_instruction *sam;
+       struct ir3_register *reg;
+
+       sam = ir3_instr_create(block, opc);
+       sam->flags |= flags;
+       ir3_reg_create(sam, 0, 0)->wrmask = wrmask;
+       if (src0) {
+               reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
+               reg->wrmask = (1 << (src0->regs_count - 1)) - 1;
+               reg->instr = src0;
+       }
+       if (src1) {
+               reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
+               reg->instr = src1;
+               reg->wrmask = (1 << (src1->regs_count - 1)) - 1;
+       }
+       sam->cat5.samp = samp;
+       sam->cat5.tex  = tex;
+       sam->cat5.type  = type;
+
+       return sam;
+}
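+
+/* An illustrative call (not lifted from the compiler): a basic texture
+ * sample writing a full vec4, where 'coord' is a fan-in collecting the
+ * coordinate components and samp/tex indices come from the frontend:
+ *
+ *   sam = ir3_SAM(block, OPC_SAM, TYPE_F32, 0xf, 0,
+ *                 samp, tex, coord, NULL);
+ */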
+
+/* cat6 instructions: */
+INSTR2(LDLV)
+INSTR2(LDG)
+INSTR2(LDL)
+INSTR3(STG)
+INSTR3(STL)
+INSTR3(LDGB)
+INSTR4(STGB)
+INSTR4(STIB)
+INSTR1(RESINFO)
+INSTR1(RESFMT)
+INSTR2(ATOMIC_ADD)
+INSTR2(ATOMIC_SUB)
+INSTR2(ATOMIC_XCHG)
+INSTR2(ATOMIC_INC)
+INSTR2(ATOMIC_DEC)
+INSTR2(ATOMIC_CMPXCHG)
+INSTR2(ATOMIC_MIN)
+INSTR2(ATOMIC_MAX)
+INSTR2(ATOMIC_AND)
+INSTR2(ATOMIC_OR)
+INSTR2(ATOMIC_XOR)
+INSTR4F(G, ATOMIC_ADD)
+INSTR4F(G, ATOMIC_SUB)
+INSTR4F(G, ATOMIC_XCHG)
+INSTR4F(G, ATOMIC_INC)
+INSTR4F(G, ATOMIC_DEC)
+INSTR4F(G, ATOMIC_CMPXCHG)
+INSTR4F(G, ATOMIC_MIN)
+INSTR4F(G, ATOMIC_MAX)
+INSTR4F(G, ATOMIC_AND)
+INSTR4F(G, ATOMIC_OR)
+INSTR4F(G, ATOMIC_XOR)
+
+/* cat7 instructions: */
+INSTR0(BAR)
+INSTR0(FENCE)
+
+/* ************************************************************************* */
+/* split this out or find some helper to use.. like main/bitset.h.. */
+
+#include <string.h>
+
+#define MAX_REG 256
+
+typedef uint8_t regmask_t[2 * MAX_REG / 8];
+
+static inline unsigned regmask_idx(struct ir3_register *reg)
+{
+       unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
+       debug_assert(num < MAX_REG);
+       if (reg->flags & IR3_REG_HALF)
+               num += MAX_REG;
+       return num;
+}
+
+static inline void regmask_init(regmask_t *regmask)
+{
+       memset(regmask, 0, sizeof(*regmask));
+}
+
+static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
+{
+       unsigned idx = regmask_idx(reg);
+       if (reg->flags & IR3_REG_RELATIV) {
+               unsigned i;
+               for (i = 0; i < reg->size; i++, idx++)
+                       (*regmask)[idx / 8] |= 1 << (idx % 8);
+       } else {
+               unsigned mask;
+               for (mask = reg->wrmask; mask; mask >>= 1, idx++)
+                       if (mask & 1)
+                               (*regmask)[idx / 8] |= 1 << (idx % 8);
+       }
+}
+
+static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
+{
+       unsigned i;
+       for (i = 0; i < ARRAY_SIZE(*dst); i++)
+               (*dst)[i] = (*a)[i] | (*b)[i];
+}
+
+/* set bits in a if not set in b, conceptually:
+ *   a |= (reg & ~b)
+ */
+static inline void regmask_set_if_not(regmask_t *a,
+               struct ir3_register *reg, regmask_t *b)
+{
+       unsigned idx = regmask_idx(reg);
+       if (reg->flags & IR3_REG_RELATIV) {
+               unsigned i;
+               for (i = 0; i < reg->size; i++, idx++)
+                       if (!((*b)[idx / 8] & (1 << (idx % 8))))
+                               (*a)[idx / 8] |= 1 << (idx % 8);
+       } else {
+               unsigned mask;
+               for (mask = reg->wrmask; mask; mask >>= 1, idx++)
+                       if (mask & 1)
+                               if (!((*b)[idx / 8] & (1 << (idx % 8))))
+                                       (*a)[idx / 8] |= 1 << (idx % 8);
+       }
+}
+
+static inline bool regmask_get(regmask_t *regmask,
+               struct ir3_register *reg)
+{
+       unsigned idx = regmask_idx(reg);
+       if (reg->flags & IR3_REG_RELATIV) {
+               unsigned i;
+               for (i = 0; i < reg->size; i++, idx++)
+                       if ((*regmask)[idx / 8] & (1 << (idx % 8)))
+                               return true;
+       } else {
+               unsigned mask;
+               for (mask = reg->wrmask; mask; mask >>= 1, idx++)
+                       if (mask & 1)
+                               if ((*regmask)[idx / 8] & (1 << (idx % 8)))
+                                       return true;
+       }
+       return false;
+}
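+
+/* A minimal sketch (not the actual legalize logic) of how these masks
+ * get used: track registers written by still-in-flight instructions in
+ * 'pending', and test each src of the next instruction against it to
+ * decide whether a sync flag is needed.  The function name is made up.
+ */
+static inline bool
+ir3_example_needs_sync(regmask_t *pending, struct ir3_instruction *instr)
+{
+       struct ir3_register *reg;
+       foreach_src(reg, instr)
+               if (reg_gpr(reg) && regmask_get(pending, reg))
+                       return true;
+       return false;
+}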
+
+/* ************************************************************************* */
+
+#endif /* IR3_H_ */
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
new file mode 100644 (file)
index 0000000..f00daeb
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/ralloc.h"
+
+#include "ir3_compiler.h"
+
+static const struct debug_named_value shader_debug_options[] = {
+               {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
+               {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
+               {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
+               {"disasm",  IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
+               {"optmsgs", IR3_DBG_OPTMSGS, "Enable optimizer debug messages"},
+               DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0)
+
+enum ir3_shader_debug ir3_shader_debug = 0;
+
+struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
+{
+       struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
+
+       ir3_shader_debug = debug_get_option_ir3_shader_debug();
+
+       compiler->dev = dev;
+       compiler->gpu_id = gpu_id;
+       compiler->set = ir3_ra_alloc_reg_set(compiler);
+
+       if (compiler->gpu_id >= 400) {
+               /* need special handling for "flat" */
+               compiler->flat_bypass = true;
+               compiler->levels_add_one = false;
+               compiler->unminify_coords = false;
+               compiler->txf_ms_with_isaml = false;
+               compiler->array_index_add_half = true;
+       } else {
+               /* no special handling for "flat" */
+               compiler->flat_bypass = false;
+               compiler->levels_add_one = true;
+               compiler->unminify_coords = true;
+               compiler->txf_ms_with_isaml = true;
+               compiler->array_index_add_half = false;
+       }
+
+       return compiler;
+}
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
new file mode 100644 (file)
index 0000000..e233606
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef IR3_COMPILER_H_
+#define IR3_COMPILER_H_
+
+#include "ir3_shader.h"
+
+struct ir3_ra_reg_set;
+
+struct ir3_compiler {
+       struct fd_device *dev;
+       uint32_t gpu_id;
+       struct ir3_ra_reg_set *set;
+       uint32_t shader_count;
+
+       /*
+        * Configuration options for things that are handled differently on
+        * different generations:
+        */
+
+       /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
+        * so we need to use ldlv.u32 to load the varying directly:
+        */
+       bool flat_bypass;
+
+       /* on a3xx, we need to add one to # of array levels:
+        */
+       bool levels_add_one;
+
+       /* on a3xx, we need to scale up integer coords for isaml based
+        * on LoD:
+        */
+       bool unminify_coords;
+
+       /* on a3xx do txf_ms w/ isaml and scaled coords: */
+       bool txf_ms_with_isaml;
+
+       /* on a4xx, for array textures we need to add 0.5 to the array
+        * index coordinate:
+        */
+       bool array_index_add_half;
+};
+
+struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);
+
+int ir3_compile_shader_nir(struct ir3_compiler *compiler,
+               struct ir3_shader_variant *so);
+
+enum ir3_shader_debug {
+       IR3_DBG_SHADER_VS = 0x01,
+       IR3_DBG_SHADER_FS = 0x02,
+       IR3_DBG_SHADER_CS = 0x04,
+       IR3_DBG_DISASM    = 0x08,
+       IR3_DBG_OPTMSGS   = 0x10,
+};
+
+extern enum ir3_shader_debug ir3_shader_debug;
+
+static inline bool
+shader_debug_enabled(gl_shader_stage type)
+{
+       switch (type) {
+       case MESA_SHADER_VERTEX:      return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
+       case MESA_SHADER_FRAGMENT:    return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
+       case MESA_SHADER_COMPUTE:     return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
+       default:
+               debug_assert(0);
+               return false;
+       }
+}
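+
+/* For example, running an application with IR3_SHADER_DEBUG=fs,disasm
+ * enables the fragment shader prints plus the NIR/native disassembly
+ * dumps (see shader_debug_options in ir3_compiler.c).
+ */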
+
+#endif /* IR3_COMPILER_H_ */
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
new file mode 100644 (file)
index 0000000..445a2b2
--- /dev/null
@@ -0,0 +1,3818 @@
+/*
+ * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdarg.h>
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+#include "ir3_nir.h"
+
+#include "instr-a3xx.h"
+#include "ir3.h"
+
+/* for conditionally setting boolean flag(s): */
+#define COND(bool, val) ((bool) ? (val) : 0)
+
+#define DBG(fmt, ...) \
+               do { debug_printf("%s:%d: "fmt "\n", \
+                               __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
+
+struct ir3_context {
+       struct ir3_compiler *compiler;
+
+       struct nir_shader *s;
+
+       struct nir_instr *cur_instr;  /* current instruction, just for debug */
+
+       struct ir3 *ir;
+       struct ir3_shader_variant *so;
+
+       struct ir3_block *block;      /* the current block */
+       struct ir3_block *in_block;   /* block created for shader inputs */
+
+       nir_function_impl *impl;
+
+       /* For fragment shaders, varyings are not actual shader inputs,
+        * instead the hw passes a varying-coord which is used with
+        * bary.f.
+        *
+        * But NIR doesn't know that, it still declares varyings as
+        * inputs.  So we do all the input tracking normally and fix
+        * things up after compile_instructions()
+        *
+        * NOTE that frag_vcoord is the hardware position (possibly it
+        * is actually an index or tag or some such.. it is *not* a
+        * value that can be directly used for gl_FragCoord..)
+        */
+       struct ir3_instruction *frag_vcoord;
+
+       /* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
+       struct ir3_instruction *frag_face, *frag_coord;
+
+       /* For vertex shaders, keep track of the system values sources */
+       struct ir3_instruction *vertex_id, *basevertex, *instance_id;
+
+       /* For fragment shaders: */
+       struct ir3_instruction *samp_id, *samp_mask_in;
+
+       /* Compute shader inputs: */
+       struct ir3_instruction *local_invocation_id, *work_group_id;
+
+       /* mapping from nir_register to defining instruction: */
+       struct hash_table *def_ht;
+
+       unsigned num_arrays;
+
+       /* a common pattern for indirect addressing is to request the
+        * same address register multiple times.  To avoid generating
+        * duplicate instruction sequences (which our backend does not
+        * try to clean up, since that should be done at the NIR stage)
+        * we cache the address value generated for a given src value:
+        *
+        * Note that we have to cache these per alignment, since the same
+        * src used for an array of vec1 cannot also be used for an
+        * array of vec4.
+        */
+       struct hash_table *addr_ht[4];
+
+       /* last dst array, for indirect we need to insert a var-store.
+        */
+       struct ir3_instruction **last_dst;
+       unsigned last_dst_n;
+
+       /* maps nir_block to ir3_block, mostly for the purposes of
+        * figuring out a block's successors
+        */
+       struct hash_table *block_ht;
+
+       /* on a4xx, bitmask of samplers which need astc+srgb workaround: */
+       unsigned astc_srgb;
+
+       unsigned samples;             /* bitmask of x,y sample shifts */
+
+       unsigned max_texture_index;
+
+       /* set if we encounter something we can't handle yet, so we
+        * can bail cleanly and fall back to the TGSI compiler f/e
+        */
+       bool error;
+};
+
+/* gpu pointer size in units of 32bit registers/slots */
+static unsigned pointer_size(struct ir3_context *ctx)
+{
+       return (ctx->compiler->gpu_id >= 500) ? 2 : 1;
+}
+
+static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
+static struct ir3_block * get_block(struct ir3_context *ctx, const nir_block *nblock);
+
+
+static struct ir3_context *
+compile_init(struct ir3_compiler *compiler,
+               struct ir3_shader_variant *so)
+{
+       struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);
+
+       if (compiler->gpu_id >= 400) {
+               if (so->type == MESA_SHADER_VERTEX) {
+                       ctx->astc_srgb = so->key.vastc_srgb;
+               } else if (so->type == MESA_SHADER_FRAGMENT) {
+                       ctx->astc_srgb = so->key.fastc_srgb;
+               }
+
+       } else {
+               if (so->type == MESA_SHADER_VERTEX) {
+                       ctx->samples = so->key.vsamples;
+               } else if (so->type == MESA_SHADER_FRAGMENT) {
+                       ctx->samples = so->key.fsamples;
+               }
+       }
+
+       ctx->compiler = compiler;
+       ctx->so = so;
+       ctx->def_ht = _mesa_hash_table_create(ctx,
+                       _mesa_hash_pointer, _mesa_key_pointer_equal);
+       ctx->block_ht = _mesa_hash_table_create(ctx,
+                       _mesa_hash_pointer, _mesa_key_pointer_equal);
+
+       /* TODO: maybe generate some sort of bitmask of what the key
+        * lowers vs what the shader has (ie. no need to run texture
+        * clamp lowering if there are no texture sample instrs)..
+        * although this should be done further up the stack to avoid
+        * creating duplicate variants..
+        */
+
+       if (ir3_key_lowers_nir(&so->key)) {
+               nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+               ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+       } else {
+               /* fast-path for shader key that lowers nothing in NIR: */
+               ctx->s = so->shader->nir;
+       }
+
+       /* this needs to be the last pass run, so do this here instead of
+        * in ir3_optimize_nir():
+        */
+       NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
+       NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
+
+       if (ir3_shader_debug & IR3_DBG_DISASM) {
+               printf("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}",
+                       so->shader->id, so->id, so->type,
+                       so->key.color_two_side, so->key.half_precision);
+               nir_print_shader(ctx->s, stdout);
+       }
+
+       if (shader_debug_enabled(so->type)) {
+               fprintf(stderr, "NIR (final form) for %s shader:\n",
+                       _mesa_shader_stage_to_string(so->type));
+               nir_print_shader(ctx->s, stderr);
+       }
+
+       ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
+
+       so->num_uniforms = ctx->s->num_uniforms;
+       so->num_ubos = ctx->s->info.num_ubos;
+
+       /* Layout of constant registers, each section aligned to vec4.  Note
+        * that pointer size (ubo, etc) changes depending on generation.
+        *
+        *    user consts
+        *    UBO addresses
+        *    SSBO sizes
+        *    if (vertex shader) {
+        *        driver params (IR3_DP_*)
+        *        if (stream_output.num_outputs > 0)
+        *           stream-out addresses
+        *    }
+        *    immediates
+        *
+        * Immediates go last mostly because they are inserted in the CP pass
+        * after the nir -> ir3 frontend.
+        */
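+       /* Illustrative layout (assumed numbers, not from a real shader):
+        * with 7 user uniform vec4s, 2 UBOs, and ptrsz == 2, constoff
+        * starts at align(7, 4) = 8, so constbase.ubo = 8 and the UBO
+        * pointer section takes align(2 * 2, 4) / 4 = 1 vec4, putting
+        * the next section at 9.
+        */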
+       unsigned constoff = align(ctx->s->num_uniforms, 4);
+       unsigned ptrsz = pointer_size(ctx);
+
+       memset(&so->constbase, ~0, sizeof(so->constbase));
+
+       if (so->num_ubos > 0) {
+               so->constbase.ubo = constoff;
+               constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
+       }
+
+       if (so->const_layout.ssbo_size.count > 0) {
+               unsigned cnt = so->const_layout.ssbo_size.count;
+               so->constbase.ssbo_sizes = constoff;
+               constoff += align(cnt, 4) / 4;
+       }
+
+       if (so->const_layout.image_dims.count > 0) {
+               unsigned cnt = so->const_layout.image_dims.count;
+               so->constbase.image_dims = constoff;
+               constoff += align(cnt, 4) / 4;
+       }
+
+       unsigned num_driver_params = 0;
+       if (so->type == MESA_SHADER_VERTEX) {
+               num_driver_params = IR3_DP_VS_COUNT;
+       } else if (so->type == MESA_SHADER_COMPUTE) {
+               num_driver_params = IR3_DP_CS_COUNT;
+       }
+
+       so->constbase.driver_param = constoff;
+       constoff += align(num_driver_params, 4) / 4;
+
+       if ((so->type == MESA_SHADER_VERTEX) &&
+                       (compiler->gpu_id < 500) &&
+                       so->shader->stream_output.num_outputs > 0) {
+               so->constbase.tfbo = constoff;
+               constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+       }
+
+       so->constbase.immediate = constoff;
+
+       return ctx;
+}
+
+static void
+compile_error(struct ir3_context *ctx, const char *format, ...)
+{
+       struct hash_table *errors = NULL;
+       va_list ap;
+       va_start(ap, format);
+       if (ctx->cur_instr) {
+               errors = _mesa_hash_table_create(NULL,
+                               _mesa_hash_pointer,
+                               _mesa_key_pointer_equal);
+               char *msg = ralloc_vasprintf(errors, format, ap);
+               _mesa_hash_table_insert(errors, ctx->cur_instr, msg);
+       } else {
+               _debug_vprintf(format, ap);
+       }
+       va_end(ap);
+       nir_print_shader_annotated(ctx->s, stdout, errors);
+       ralloc_free(errors);
+       ctx->error = true;
+       debug_assert(0);
+}
+
+#define compile_assert(ctx, cond) do { \
+               if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
+       } while (0)
+
+static void
+compile_free(struct ir3_context *ctx)
+{
+       ralloc_free(ctx);
+}
+
+static void
+declare_array(struct ir3_context *ctx, nir_register *reg)
+{
+       struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
+       arr->id = ++ctx->num_arrays;
+       /* NOTE: sometimes we get non-array regs, for example for arrays of
+        * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
+        * treat a non-array as if it were an array of length 1.
+        *
+        * It would be nice if there was a nir pass to convert arrays of
+        * length 1 to ssa.
+        */
+       arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
+       compile_assert(ctx, arr->length > 0);
+       arr->r = reg;
+       list_addtail(&arr->node, &ctx->ir->array_list);
+}
+
+static struct ir3_array *
+get_array(struct ir3_context *ctx, nir_register *reg)
+{
+       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+               if (arr->r == reg)
+                       return arr;
+       }
+       compile_error(ctx, "bogus reg: %s\n", reg->name);
+       return NULL;
+}
+
+/* relative (indirect) if address!=NULL */
+static struct ir3_instruction *
+create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
+               struct ir3_instruction *address)
+{
+       struct ir3_block *block = ctx->block;
+       struct ir3_instruction *mov;
+       struct ir3_register *src;
+
+       mov = ir3_instr_create(block, OPC_MOV);
+       mov->cat1.src_type = TYPE_U32;
+       mov->cat1.dst_type = TYPE_U32;
+       mov->barrier_class = IR3_BARRIER_ARRAY_R;
+       mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
+       ir3_reg_create(mov, 0, 0);
+       src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+                       COND(address, IR3_REG_RELATIV));
+       src->instr = arr->last_write;
+       src->size  = arr->length;
+       src->array.id = arr->id;
+       src->array.offset = n;
+
+       if (address)
+               ir3_instr_set_address(mov, address);
+
+       return mov;
+}
+
+/* relative (indirect) if address!=NULL */
+static void
+create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
+               struct ir3_instruction *src, struct ir3_instruction *address)
+{
+       struct ir3_block *block = ctx->block;
+       struct ir3_instruction *mov;
+       struct ir3_register *dst;
+
+       /* if not relative store, don't create an extra mov, since that
+        * ends up being difficult for cp to remove.
+        */
+       if (!address) {
+               dst = src->regs[0];
+
+               src->barrier_class |= IR3_BARRIER_ARRAY_W;
+               src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
+
+               dst->flags |= IR3_REG_ARRAY;
+               dst->instr = arr->last_write;
+               dst->size = arr->length;
+               dst->array.id = arr->id;
+               dst->array.offset = n;
+
+               arr->last_write = src;
+
+               array_insert(block, block->keeps, src);
+
+               return;
+       }
+
+       mov = ir3_instr_create(block, OPC_MOV);
+       mov->cat1.src_type = TYPE_U32;
+       mov->cat1.dst_type = TYPE_U32;
+       mov->barrier_class = IR3_BARRIER_ARRAY_W;
+       mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
+       dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+                       COND(address, IR3_REG_RELATIV));
+       dst->instr = arr->last_write;
+       dst->size  = arr->length;
+       dst->array.id = arr->id;
+       dst->array.offset = n;
+       ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
+
+       if (address)
+               ir3_instr_set_address(mov, address);
+
+       arr->last_write = mov;
+
+       /* the array store may only matter to something in an earlier
+        * block (ie. loops), but since arrays are not in SSA, depth
+        * pass won't know this.. so keep all array stores:
+        */
+       array_insert(block, block->keeps, mov);
+}
+
+static inline type_t utype_for_size(unsigned bit_size)
+{
+       switch (bit_size) {
+       case 32: return TYPE_U32;
+       case 16: return TYPE_U16;
+       case  8: return TYPE_U8;
+       default: unreachable("bad bitsize"); return ~0;
+       }
+}
+
+static inline type_t utype_src(nir_src src)
+{ return utype_for_size(nir_src_bit_size(src)); }
+
+static inline type_t utype_dst(nir_dest dst)
+{ return utype_for_size(nir_dest_bit_size(dst)); }
+
+/* allocate an n element value array (to be populated by caller) and
+ * insert in def_ht
+ */
+static struct ir3_instruction **
+get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n)
+{
+       struct ir3_instruction **value =
+               ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
+       _mesa_hash_table_insert(ctx->def_ht, dst, value);
+       return value;
+}
+
+static struct ir3_instruction **
+get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n)
+{
+       struct ir3_instruction **value;
+
+       if (dst->is_ssa) {
+               value = get_dst_ssa(ctx, &dst->ssa, n);
+       } else {
+               value = ralloc_array(ctx, struct ir3_instruction *, n);
+       }
+
+       /* NOTE: in non-ssa case, we don't really need to store last_dst
+        * but this helps us catch cases where a put_dst() call is forgotten
+        */
+       compile_assert(ctx, !ctx->last_dst);
+       ctx->last_dst = value;
+       ctx->last_dst_n = n;
+
+       return value;
+}
+
+static struct ir3_instruction * get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align);
+
+static struct ir3_instruction * const *
+get_src(struct ir3_context *ctx, nir_src *src)
+{
+       if (src->is_ssa) {
+               struct hash_entry *entry;
+               entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
+               compile_assert(ctx, entry);
+               return entry->data;
+       } else {
+               nir_register *reg = src->reg.reg;
+               struct ir3_array *arr = get_array(ctx, reg);
+               unsigned num_components = arr->r->num_components;
+               struct ir3_instruction *addr = NULL;
+               struct ir3_instruction **value =
+                       ralloc_array(ctx, struct ir3_instruction *, num_components);
+
+               if (src->reg.indirect)
+                       addr = get_addr(ctx, get_src(ctx, src->reg.indirect)[0],
+                                       reg->num_components);
+
+               for (unsigned i = 0; i < num_components; i++) {
+                       unsigned n = src->reg.base_offset * reg->num_components + i;
+                       compile_assert(ctx, n < arr->length);
+                       value[i] = create_array_load(ctx, arr, n, addr);
+               }
+
+               return value;
+       }
+}
+
+static void
+put_dst(struct ir3_context *ctx, nir_dest *dst)
+{
+       unsigned bit_size = nir_dest_bit_size(*dst);
+
+       if (bit_size < 32) {
+               for (unsigned i = 0; i < ctx->last_dst_n; i++) {
+                       struct ir3_instruction *dst = ctx->last_dst[i];
+                       dst->regs[0]->flags |= IR3_REG_HALF;
+                       if (ctx->last_dst[i]->opc == OPC_META_FO)
+                               dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
+               }
+       }
+
+       if (!dst->is_ssa) {
+               nir_register *reg = dst->reg.reg;
+               struct ir3_array *arr = get_array(ctx, reg);
+               unsigned num_components = ctx->last_dst_n;
+               struct ir3_instruction *addr = NULL;
+
+               if (dst->reg.indirect)
+                       addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0],
+                                       reg->num_components);
+
+               for (unsigned i = 0; i < num_components; i++) {
+                       unsigned n = dst->reg.base_offset * reg->num_components + i;
+                       compile_assert(ctx, n < arr->length);
+                       if (!ctx->last_dst[i])
+                               continue;
+                       create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
+               }
+
+               ralloc_free(ctx->last_dst);
+       }
+       ctx->last_dst = NULL;
+       ctx->last_dst_n = 0;
+}
+
+static struct ir3_instruction *
+create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
+{
+       struct ir3_instruction *mov;
+       unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
+
+       mov = ir3_instr_create(block, OPC_MOV);
+       mov->cat1.src_type = type;
+       mov->cat1.dst_type = type;
+       ir3_reg_create(mov, 0, flags);
+       ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
+
+       return mov;
+}
+
+static struct ir3_instruction *
+create_immed(struct ir3_block *block, uint32_t val)
+{
+       return create_immed_typed(block, val, TYPE_U32);
+}
+
+static struct ir3_instruction *
+create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
+{
+       struct ir3_instruction *instr, *immed;
+
+       /* TODO in at least some cases, the backend could probably be
+        * made clever enough to propagate IR3_REG_HALF..
+        */
+       instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
+       instr->regs[0]->flags |= IR3_REG_HALF;
+
+       switch(align){
+       case 1:
+               /* src *= 1: */
+               break;
+       case 2:
+               /* src *= 2     => src <<= 1: */
+               immed = create_immed(block, 1);
+               immed->regs[0]->flags |= IR3_REG_HALF;
+
+               instr = ir3_SHL_B(block, instr, 0, immed, 0);
+               instr->regs[0]->flags |= IR3_REG_HALF;
+               instr->regs[1]->flags |= IR3_REG_HALF;
+               break;
+       case 3:
+               /* src *= 3: */
+               immed = create_immed(block, 3);
+               immed->regs[0]->flags |= IR3_REG_HALF;
+
+               instr = ir3_MULL_U(block, instr, 0, immed, 0);
+               instr->regs[0]->flags |= IR3_REG_HALF;
+               instr->regs[1]->flags |= IR3_REG_HALF;
+               break;
+       case 4:
+               /* src *= 4 => src <<= 2: */
+               immed = create_immed(block, 2);
+               immed->regs[0]->flags |= IR3_REG_HALF;
+
+               instr = ir3_SHL_B(block, instr, 0, immed, 0);
+               instr->regs[0]->flags |= IR3_REG_HALF;
+               instr->regs[1]->flags |= IR3_REG_HALF;
+               break;
+       default:
+               unreachable("bad align");
+               return NULL;
+       }
+
+       instr = ir3_MOV(block, instr, TYPE_S16);
+       instr->regs[0]->num = regid(REG_A0, 0);
+       instr->regs[0]->flags |= IR3_REG_HALF;
+       instr->regs[1]->flags |= IR3_REG_HALF;
+
+       return instr;
+}
+
+/* caches addr values to avoid generating multiple cov/shl/mova
+ * sequences for each use of a given NIR-level src as an address
+ */
+static struct ir3_instruction *
+get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
+{
+       struct ir3_instruction *addr;
+       unsigned idx = align - 1;
+
+       compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
+
+       if (!ctx->addr_ht[idx]) {
+               ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
+                               _mesa_hash_pointer, _mesa_key_pointer_equal);
+       } else {
+               struct hash_entry *entry;
+               entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
+               if (entry)
+                       return entry->data;
+       }
+
+       addr = create_addr(ctx->block, src, align);
+       _mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
+
+       return addr;
+}
+
+static struct ir3_instruction *
+get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *cond;
+
+       /* NOTE: only cmps.*.* can write p0.x: */
+       cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
+       cond->cat2.condition = IR3_COND_NE;
+
+       /* condition always goes in predicate register: */
+       cond->regs[0]->num = regid(REG_P0, 0);
+
+       return cond;
+}
+
+static struct ir3_instruction *
+create_uniform(struct ir3_context *ctx, unsigned n)
+{
+       struct ir3_instruction *mov;
+
+       mov = ir3_instr_create(ctx->block, OPC_MOV);
+       /* TODO get types right? */
+       mov->cat1.src_type = TYPE_F32;
+       mov->cat1.dst_type = TYPE_F32;
+       ir3_reg_create(mov, 0, 0);
+       ir3_reg_create(mov, n, IR3_REG_CONST);
+
+       return mov;
+}
+
+static struct ir3_instruction *
+create_uniform_indirect(struct ir3_context *ctx, int n,
+               struct ir3_instruction *address)
+{
+       struct ir3_instruction *mov;
+
+       mov = ir3_instr_create(ctx->block, OPC_MOV);
+       mov->cat1.src_type = TYPE_U32;
+       mov->cat1.dst_type = TYPE_U32;
+       ir3_reg_create(mov, 0, 0);
+       ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
+
+       ir3_instr_set_address(mov, address);
+
+       return mov;
+}
+
+static struct ir3_instruction *
+create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
+               unsigned arrsz)
+{
+       struct ir3_block *block = ctx->block;
+       struct ir3_instruction *collect;
+
+       if (arrsz == 0)
+               return NULL;
+
+       unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF;
+
+       collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
+       ir3_reg_create(collect, 0, flags);     /* dst */
+       for (unsigned i = 0; i < arrsz; i++) {
+               struct ir3_instruction *elem = arr[i];
+
+               /* Since arrays are pre-colored in RA, we can't assume that
+                * things will end up in the right place.  (Ie. if a collect
+                * joins elements from two different arrays.)  So insert an
+                * extra mov.
+                *
+                * We could possibly skip this if all the collected elements
+                * are contiguous elements in a single array.. not sure how
+                * likely that is to happen.
+                *
+                * Fixes a problem with glamor shaders, that in effect do
+                * something like:
+                *
+                *   if (foo)
+                *     texcoord = ..
+                *   else
+                *     texcoord = ..
+                *   color = texture2D(tex, texcoord);
+                *
+                * In this case, texcoord will end up as nir registers (which
+                * translate to ir3 arrays of length 1), and we can't assume
+                * the two (or more) arrays will get allocated in consecutive
+                * scalar registers.
+                */
+               if (elem->regs[0]->flags & IR3_REG_ARRAY) {
+                       type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+                       elem = ir3_MOV(block, elem, type);
+               }
+
+               compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags);
+               ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
+       }
+
+       return collect;
+}
+
+static struct ir3_instruction *
+create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n,
+               struct ir3_instruction *address, struct ir3_instruction *collect)
+{
+       struct ir3_block *block = ctx->block;
+       struct ir3_instruction *mov;
+       struct ir3_register *src;
+
+       mov = ir3_instr_create(block, OPC_MOV);
+       mov->cat1.src_type = TYPE_U32;
+       mov->cat1.dst_type = TYPE_U32;
+       ir3_reg_create(mov, 0, 0);
+       src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
+       src->instr = collect;
+       src->size  = arrsz;
+       src->array.offset = n;
+
+       ir3_instr_set_address(mov, address);
+
+       return mov;
+}
+
+static struct ir3_instruction *
+create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask)
+{
+       struct ir3_instruction *in;
+
+       in = ir3_instr_create(ctx->in_block, OPC_META_INPUT);
+       in->inout.block = ctx->in_block;
+       ir3_reg_create(in, n, 0);
+
+       in->regs[0]->wrmask = compmask;
+
+       return in;
+}
+
+static struct ir3_instruction *
+create_input(struct ir3_context *ctx, unsigned n)
+{
+       return create_input_compmask(ctx, n, 0x1);
+}
+
+static struct ir3_instruction *
+create_frag_input(struct ir3_context *ctx, bool use_ldlv)
+{
+       struct ir3_block *block = ctx->block;
+       struct ir3_instruction *instr;
+       /* actual inloc is assigned and fixed up later: */
+       struct ir3_instruction *inloc = create_immed(block, 0);
+
+       if (use_ldlv) {
+               instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
+               instr->cat6.type = TYPE_U32;
+               instr->cat6.iim_val = 1;
+       } else {
+               instr = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0);
+               instr->regs[2]->wrmask = 0x3;
+       }
+
+       return instr;
+}
+
+static struct ir3_instruction *
+create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
+{
+       /* first four vec4 sysvals reserved for UBOs: */
+       /* NOTE: dp is a scalar index, but there can be >4 dp components: */
+       unsigned n = ctx->so->constbase.driver_param;
+       unsigned r = regid(n + dp / 4, dp % 4);
+       return create_uniform(ctx, r);
+}
+
+/* helper for instructions that produce multiple consecutive scalar
+ * outputs which need to have a split/fanout meta instruction inserted
+ */
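+/* (Illustrative: with base=0, n=3, and a src wrmask of 0b101, splits
+ * are created for offsets 0..2 but only offsets 0 and 2 are returned,
+ * in dst[0] and dst[1] respectively.)
+ */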
+static void
+split_dest(struct ir3_block *block, struct ir3_instruction **dst,
+               struct ir3_instruction *src, unsigned base, unsigned n)
+{
+       struct ir3_instruction *prev = NULL;
+
+       if ((n == 1) && (src->regs[0]->wrmask == 0x1)) {
+               dst[0] = src;
+               return;
+       }
+
+       for (int i = 0, j = 0; i < n; i++) {
+               struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
+               ir3_reg_create(split, 0, IR3_REG_SSA);
+               ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
+               split->fo.off = i + base;
+
+               if (prev) {
+                       split->cp.left = prev;
+                       split->cp.left_cnt++;
+                       prev->cp.right = split;
+                       prev->cp.right_cnt++;
+               }
+               prev = split;
+
+               if (src->regs[0]->wrmask & (1 << (i + base)))
+                       dst[j++] = split;
+       }
+}
+
+/*
+ * Adreno uses uint rather than having a dedicated bool type,
+ * which (potentially) requires some conversion, in particular
+ * when using the output of a bool instr as an int input, or
+ * vice versa.
+ *
+ *         | Adreno  |  NIR  |
+ *  -------+---------+-------+-
+ *   true  |    1    |  ~0   |
+ *   false |    0    |   0   |
+ *
+ * To convert from an adreno bool (uint) to nir, use:
+ *
+ *    absneg.s dst, (neg)src
+ *
+ * To convert back in the other direction:
+ *
+ *    absneg.s dst, (abs)src
+ *
+ * The CP step can clean up the absneg.s that cancel each other
+ * out, and with a slight bit of extra cleverness (to recognize
+ * the instructions which produce either a 0 or 1) can eliminate
+ * the absneg.s's completely when an instruction that wants
+ * 0/1 consumes the result.  For example, when a nir 'bcsel'
+ * consumes the result of 'feq'.  So we should be able to get by
+ * without a boolean resolve step, and without incurring any
+ * extra penalty in instruction count.
+ */
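+
+/* Worked example of the table above (a sketch of the values, not
+ * emitted code): adreno true is 1, and absneg.s dst, (neg)src yields
+ * -(1) == ~0, i.e. NIR true; in the other direction,
+ * absneg.s dst, (abs)src yields abs(~0) == abs(-1) == 1.
+ */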
+
+/* NIR bool -> native (adreno): */
+static struct ir3_instruction *
+ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
+{
+       return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
+}
+
+/* native (adreno) -> NIR bool: */
+static struct ir3_instruction *
+ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
+{
+       return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
+}
+
+/*
+ * alu/sfu instructions:
+ */
+
+static struct ir3_instruction *
+create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
+               unsigned src_bitsize, nir_op op)
+{
+       type_t src_type, dst_type;
+
+       switch (op) {
+       case nir_op_f2f32:
+       case nir_op_f2f16_rtne:
+       case nir_op_f2f16_rtz:
+       case nir_op_f2f16:
+       case nir_op_f2i32:
+       case nir_op_f2i16:
+       case nir_op_f2i8:
+       case nir_op_f2u32:
+       case nir_op_f2u16:
+       case nir_op_f2u8:
+               switch (src_bitsize) {
+               case 32:
+                       src_type = TYPE_F32;
+                       break;
+               case 16:
+                       src_type = TYPE_F16;
+                       break;
+               default:
+                       compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+               }
+               break;
+
+       case nir_op_i2f32:
+       case nir_op_i2f16:
+       case nir_op_i2i32:
+       case nir_op_i2i16:
+       case nir_op_i2i8:
+               switch (src_bitsize) {
+               case 32:
+                       src_type = TYPE_S32;
+                       break;
+               case 16:
+                       src_type = TYPE_S16;
+                       break;
+               case 8:
+                       src_type = TYPE_S8;
+                       break;
+               default:
+                       compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+               }
+               break;
+
+       case nir_op_u2f32:
+       case nir_op_u2f16:
+       case nir_op_u2u32:
+       case nir_op_u2u16:
+       case nir_op_u2u8:
+               switch (src_bitsize) {
+               case 32:
+                       src_type = TYPE_U32;
+                       break;
+               case 16:
+                       src_type = TYPE_U16;
+                       break;
+               case 8:
+                       src_type = TYPE_U8;
+                       break;
+               default:
+                       compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+               }
+               break;
+
+       default:
+               compile_error(ctx, "invalid conversion op: %u", op);
+       }
+
+       switch (op) {
+       case nir_op_f2f32:
+       case nir_op_i2f32:
+       case nir_op_u2f32:
+               dst_type = TYPE_F32;
+               break;
+
+       case nir_op_f2f16_rtne:
+       case nir_op_f2f16_rtz:
+       case nir_op_f2f16:
+               /* TODO how to handle rounding mode? */
+       case nir_op_i2f16:
+       case nir_op_u2f16:
+               dst_type = TYPE_F16;
+               break;
+
+       case nir_op_f2i32:
+       case nir_op_i2i32:
+               dst_type = TYPE_S32;
+               break;
+
+       case nir_op_f2i16:
+       case nir_op_i2i16:
+               dst_type = TYPE_S16;
+               break;
+
+       case nir_op_f2i8:
+       case nir_op_i2i8:
+               dst_type = TYPE_S8;
+               break;
+
+       case nir_op_f2u32:
+       case nir_op_u2u32:
+               dst_type = TYPE_U32;
+               break;
+
+       case nir_op_f2u16:
+       case nir_op_u2u16:
+               dst_type = TYPE_U16;
+               break;
+
+       case nir_op_f2u8:
+       case nir_op_u2u8:
+               dst_type = TYPE_U8;
+               break;
+
+       default:
+               compile_error(ctx, "invalid conversion op: %u", op);
+       }
+
+       return ir3_COV(ctx->block, src, src_type, dst_type);
+}
+
+static void
+emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
+{
+       const nir_op_info *info = &nir_op_infos[alu->op];
+       struct ir3_instruction **dst, *src[info->num_inputs];
+       unsigned bs[info->num_inputs];     /* bit size */
+       struct ir3_block *b = ctx->block;
+       unsigned dst_sz, wrmask;
+
+       if (alu->dest.dest.is_ssa) {
+               dst_sz = alu->dest.dest.ssa.num_components;
+               wrmask = (1 << dst_sz) - 1;
+       } else {
+               dst_sz = alu->dest.dest.reg.reg->num_components;
+               wrmask = alu->dest.write_mask;
+       }
+
+       dst = get_dst(ctx, &alu->dest.dest, dst_sz);
+
+       /* Vectors are special in that they have non-scalarized writemasks,
+        * and just take the first swizzle channel for each argument in
+        * order into each writemask channel.
+        */
+       if ((alu->op == nir_op_vec2) ||
+                       (alu->op == nir_op_vec3) ||
+                       (alu->op == nir_op_vec4)) {
+
+               for (int i = 0; i < info->num_inputs; i++) {
+                       nir_alu_src *asrc = &alu->src[i];
+
+                       compile_assert(ctx, !asrc->abs);
+                       compile_assert(ctx, !asrc->negate);
+
+                       src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
+                       if (!src[i])
+                               src[i] = create_immed(ctx->block, 0);
+                       dst[i] = ir3_MOV(b, src[i], TYPE_U32);
+               }
+
+               put_dst(ctx, &alu->dest.dest);
+               return;
+       }
+
+       /* We also get mov's with more than one component, so handle
+        * those specially:
+        */
+       if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) {
+               type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32;
+               nir_alu_src *asrc = &alu->src[0];
+               struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src);
+
+               for (unsigned i = 0; i < dst_sz; i++) {
+                       if (wrmask & (1 << i)) {
+                               dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type);
+                       } else {
+                               dst[i] = NULL;
+                       }
+               }
+
+               put_dst(ctx, &alu->dest.dest);
+               return;
+       }
+
+       /* General case: We can just grab the one used channel per src. */
+       for (int i = 0; i < info->num_inputs; i++) {
+               unsigned chan = ffs(alu->dest.write_mask) - 1;
+               nir_alu_src *asrc = &alu->src[i];
+
+               compile_assert(ctx, !asrc->abs);
+               compile_assert(ctx, !asrc->negate);
+
+               src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
+               bs[i] = nir_src_bit_size(asrc->src);
+
+               compile_assert(ctx, src[i]);
+       }
+
+       switch (alu->op) {
+       case nir_op_f2f32:
+       case nir_op_f2f16_rtne:
+       case nir_op_f2f16_rtz:
+       case nir_op_f2f16:
+       case nir_op_f2i32:
+       case nir_op_f2i16:
+       case nir_op_f2i8:
+       case nir_op_f2u32:
+       case nir_op_f2u16:
+       case nir_op_f2u8:
+       case nir_op_i2f32:
+       case nir_op_i2f16:
+       case nir_op_i2i32:
+       case nir_op_i2i16:
+       case nir_op_i2i8:
+       case nir_op_u2f32:
+       case nir_op_u2f16:
+       case nir_op_u2u32:
+       case nir_op_u2u16:
+       case nir_op_u2u8:
+               dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
+               break;
+       case nir_op_f2b:
+               dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
+               dst[0]->cat2.condition = IR3_COND_NE;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_b2f:
+               dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
+               break;
+       case nir_op_b2i:
+               dst[0] = ir3_b2n(b, src[0]);
+               break;
+       case nir_op_i2b:
+               dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+               dst[0]->cat2.condition = IR3_COND_NE;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+
+       case nir_op_fneg:
+               dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
+               break;
+       case nir_op_fabs:
+               dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
+               break;
+       case nir_op_fmax:
+               dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_fmin:
+               dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_fsat:
+               /* if there is just a single use of the src, and it supports
+                * the (sat) bit, we can just fold the (sat) flag back to the
+                * src instruction and create a mov.  This is easier for cp
+                * to eliminate.
+                *
+                * TODO probably opc_cat==4 is ok too
+                */
+               if (alu->src[0].src.is_ssa &&
+                               (list_length(&alu->src[0].src.ssa->uses) == 1) &&
+                               ((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) {
+                       src[0]->flags |= IR3_INSTR_SAT;
+                       dst[0] = ir3_MOV(b, src[0], TYPE_U32);
+               } else {
+                       /* otherwise generate a max.f that saturates.. blob does
+                        * similar (generating a cat2 mov using max.f)
+                        */
+                       dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
+                       dst[0]->flags |= IR3_INSTR_SAT;
+               }
+               break;
+       case nir_op_fmul:
+               dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_fadd:
+               dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_fsub:
+               dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
+               break;
+       case nir_op_ffma:
+               dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
+               break;
+       case nir_op_fddx:
+               dst[0] = ir3_DSX(b, src[0], 0);
+               dst[0]->cat5.type = TYPE_F32;
+               break;
+       case nir_op_fddy:
+               dst[0] = ir3_DSY(b, src[0], 0);
+               dst[0]->cat5.type = TYPE_F32;
+               break;
+       case nir_op_flt:
+               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_LT;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_fge:
+               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_GE;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_feq:
+               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_EQ;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_fne:
+               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_NE;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_fceil:
+               dst[0] = ir3_CEIL_F(b, src[0], 0);
+               break;
+       case nir_op_ffloor:
+               dst[0] = ir3_FLOOR_F(b, src[0], 0);
+               break;
+       case nir_op_ftrunc:
+               dst[0] = ir3_TRUNC_F(b, src[0], 0);
+               break;
+       case nir_op_fround_even:
+               dst[0] = ir3_RNDNE_F(b, src[0], 0);
+               break;
+       case nir_op_fsign:
+               dst[0] = ir3_SIGN_F(b, src[0], 0);
+               break;
+
+       case nir_op_fsin:
+               dst[0] = ir3_SIN(b, src[0], 0);
+               break;
+       case nir_op_fcos:
+               dst[0] = ir3_COS(b, src[0], 0);
+               break;
+       case nir_op_frsq:
+               dst[0] = ir3_RSQ(b, src[0], 0);
+               break;
+       case nir_op_frcp:
+               dst[0] = ir3_RCP(b, src[0], 0);
+               break;
+       case nir_op_flog2:
+               dst[0] = ir3_LOG2(b, src[0], 0);
+               break;
+       case nir_op_fexp2:
+               dst[0] = ir3_EXP2(b, src[0], 0);
+               break;
+       case nir_op_fsqrt:
+               dst[0] = ir3_SQRT(b, src[0], 0);
+               break;
+
+       case nir_op_iabs:
+               dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
+               break;
+       case nir_op_iadd:
+               dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_iand:
+               dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_imax:
+               dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_umax:
+               dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_imin:
+               dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_umin:
+               dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_imul:
+               /*
+                * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
+                *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
+                *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
+                *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
+                */
+               dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
+                                       ir3_MADSH_M16(b, src[0], 0, src[1], 0,
+                                               ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
+               break;
+       case nir_op_ineg:
+               dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
+               break;
+       case nir_op_inot:
+               dst[0] = ir3_NOT_B(b, src[0], 0);
+               break;
+       case nir_op_ior:
+               dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_ishl:
+               dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_ishr:
+               dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_isign: {
+               /* maybe this would be sane to lower in nir.. */
+               struct ir3_instruction *neg, *pos;
+
+               neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+               neg->cat2.condition = IR3_COND_LT;
+
+               pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
+               pos->cat2.condition = IR3_COND_GT;
+
+               dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
+
+               break;
+       }
+       case nir_op_isub:
+               dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_ixor:
+               dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_ushr:
+               dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
+               break;
+       case nir_op_ilt:
+               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_LT;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_ige:
+               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_GE;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_ieq:
+               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_EQ;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_ine:
+               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_NE;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_ult:
+               dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_LT;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+       case nir_op_uge:
+               dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
+               dst[0]->cat2.condition = IR3_COND_GE;
+               dst[0] = ir3_n2b(b, dst[0]);
+               break;
+
+       case nir_op_bcsel: {
+               struct ir3_instruction *cond = ir3_b2n(b, src[0]);
+               compile_assert(ctx, bs[1] == bs[2]);
+               /* the boolean condition is 32b even if src[1] and src[2] are
+                * half-precision, but sel.b16 wants all three src's to be the
+                * same type.
+                */
+               if (bs[1] < 32)
+                       cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
+               dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
+               break;
+       }
+       case nir_op_bit_count:
+               dst[0] = ir3_CBITS_B(b, src[0], 0);
+               break;
+       case nir_op_ifind_msb: {
+               struct ir3_instruction *cmp;
+               dst[0] = ir3_CLZ_S(b, src[0], 0);
+               cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
+               cmp->cat2.condition = IR3_COND_GE;
+               dst[0] = ir3_SEL_B32(b,
+                               ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+                               cmp, 0, dst[0], 0);
+               break;
+       }
+       case nir_op_ufind_msb:
+               dst[0] = ir3_CLZ_B(b, src[0], 0);
+               dst[0] = ir3_SEL_B32(b,
+                               ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+                               src[0], 0, dst[0], 0);
+               break;
+       case nir_op_find_lsb:
+               dst[0] = ir3_BFREV_B(b, src[0], 0);
+               dst[0] = ir3_CLZ_B(b, dst[0], 0);
+               break;
+       case nir_op_bitfield_reverse:
+               dst[0] = ir3_BFREV_B(b, src[0], 0);
+               break;
+
+       default:
+               compile_error(ctx, "Unhandled ALU op: %s\n",
+                               nir_op_infos[alu->op].name);
+               break;
+       }
+
+       put_dst(ctx, &alu->dest.dest);
+}
+
+/* handles direct/indirect UBO reads: */
+static void
+emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
+       nir_const_value *const_offset;
+       /* UBO addresses are the first driver params: */
+       unsigned ubo = regid(ctx->so->constbase.ubo, 0);
+       const unsigned ptrsz = pointer_size(ctx);
+
+       int off = 0;
+
+       /* First src is ubo index, which could either be an immed or not: */
+       src0 = get_src(ctx, &intr->src[0])[0];
+       if (is_same_type_mov(src0) &&
+                       (src0->regs[1]->flags & IR3_REG_IMMED)) {
+               base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz));
+               base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
+       } else {
+               base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0, 4));
+               base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0, 4));
+       }
+
+       /* note: on 32bit gpus base_hi is ignored and DCE'd */
+       addr = base_lo;
+
+       const_offset = nir_src_as_const_value(intr->src[1]);
+       if (const_offset) {
+               off += const_offset->u32[0];
+       } else {
+               /* For load_ubo_indirect, second src is indirect offset: */
+               src1 = get_src(ctx, &intr->src[1])[0];
+
+               /* and add offset to addr: */
+               addr = ir3_ADD_S(b, addr, 0, src1, 0);
+       }
+
+       /* if offset is too large to encode in the ldg, split it out: */
+       if ((off + (intr->num_components * 4)) > 1024) {
+               /* split out the minimal amount to improve the odds that
+                * cp can fit the immediate in the add.s instruction:
+                */
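+               /* Illustrative numbers (assumed): off=1200 with 4 components
+                * gives off2 = 1200 + 16 - 1024 = 192 folded into the add.s,
+                * leaving off = 1008 so the per-component byte offsets
+                * (1008..1020) still encode in the ldg.
+                */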
+               unsigned off2 = off + (intr->num_components * 4) - 1024;
+               addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
+               off -= off2;
+       }
+
+       if (ptrsz == 2) {
+               struct ir3_instruction *carry;
+
+               /* handle 32b rollover, ie:
+                *   if (addr < base_lo)
+                *      base_hi++
+                */
+               carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
+               carry->cat2.condition = IR3_COND_LT;
+               base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
+
+               addr = create_collect(ctx, (struct ir3_instruction*[]){ addr, base_hi }, 2);
+       }
+
+       for (int i = 0; i < intr->num_components; i++) {
+               struct ir3_instruction *load =
+                               ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
+               load->cat6.type = TYPE_U32;
+               load->cat6.src_offset = off + i * 4;     /* byte offset */
+               dst[i] = load;
+       }
+}
+
+/* src[] = { buffer_index, offset }. No const_index */
+static void
+emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *ldgb, *src0, *src1, *offset;
+       nir_const_value *const_offset;
+
+       /* can this be non-const buffer_index?  how do we handle that? */
+       const_offset = nir_src_as_const_value(intr->src[0]);
+       compile_assert(ctx, const_offset);
+
+       offset = get_src(ctx, &intr->src[1])[0];
+
+       /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
+       src0 = create_collect(ctx, (struct ir3_instruction*[]){
+               offset,
+               create_immed(b, 0),
+       }, 2);
+       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+
+       ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0,
+                       src0, 0, src1, 0);
+       ldgb->regs[0]->wrmask = MASK(intr->num_components);
+       ldgb->cat6.iim_val = intr->num_components;
+       ldgb->cat6.d = 4;
+       ldgb->cat6.type = TYPE_U32;
+       ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
+       ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
+
+       split_dest(b, dst, ldgb, 0, intr->num_components);
+}
+
+/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
+static void
+emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *stgb, *src0, *src1, *src2, *offset;
+       nir_const_value *const_offset;
+       /* TODO handle wrmask properly, see _store_shared().. but I think
+        * it is more of a PITA than that, since the blob ends up loading the
+        * masked components and writing them back out.
+        */
+       unsigned wrmask = intr->const_index[0];
+       unsigned ncomp = ffs(~wrmask) - 1;
+
+       /* can this be non-const buffer_index?  how do we handle that? */
+       const_offset = nir_src_as_const_value(intr->src[1]);
+       compile_assert(ctx, const_offset);
+
+       offset = get_src(ctx, &intr->src[2])[0];
+
+       /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
+        * nir already *= 4:
+        */
+       src0 = create_collect(ctx, get_src(ctx, &intr->src[0]), ncomp);
+       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+       src2 = create_collect(ctx, (struct ir3_instruction*[]){
+               offset,
+               create_immed(b, 0),
+       }, 2);
+
+       stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0,
+                       src0, 0, src1, 0, src2, 0);
+       stgb->cat6.iim_val = ncomp;
+       stgb->cat6.d = 4;
+       stgb->cat6.type = TYPE_U32;
+       stgb->barrier_class = IR3_BARRIER_BUFFER_W;
+       stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+
+       array_insert(b, b->keeps, stgb);
+}
+
+/* src[] = { block_index } */
+static void
+emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       /* SSBO size stored as a const starting at ssbo_sizes: */
+       unsigned blk_idx = nir_src_as_const_value(intr->src[0])->u32[0];
+       unsigned idx = regid(ctx->so->constbase.ssbo_sizes, 0) +
+               ctx->so->const_layout.ssbo_size.off[blk_idx];
+
+       debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx));
+
+       dst[0] = create_uniform(ctx, idx);
+}
+
+/*
+ * SSBO atomic intrinsics
+ *
+ * All of the SSBO atomic memory operations read a value from memory,
+ * compute a new value using one of the operations below, write the new
+ * value to memory, and return the original value read.
+ *
+ * All operations take 3 sources except CompSwap that takes 4. These
+ * sources represent:
+ *
+ * 0: The SSBO buffer index.
+ * 1: The offset into the SSBO buffer of the variable that the atomic
+ *    operation will operate on.
+ * 2: The data parameter to the atomic function (i.e. the value to add
+ *    in ssbo_atomic_add, etc).
+ * 3: For CompSwap only: the second data parameter.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset;
+       nir_const_value *const_offset;
+       type_t type = TYPE_U32;
+
+       /* can this be non-const buffer_index?  how do we handle that? */
+       const_offset = nir_src_as_const_value(intr->src[0]);
+       compile_assert(ctx, const_offset);
+       ssbo = create_immed(b, const_offset->u32[0]);
+
+       offset = get_src(ctx, &intr->src[1])[0];
+
+       /* src0 is data (or uvec2(data, compare))
+        * src1 is offset
+        * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
+        *
+        * Note that nir already multiplies the offset by four
+        */
+       src0 = get_src(ctx, &intr->src[2])[0];
+       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+       src2 = create_collect(ctx, (struct ir3_instruction*[]){
+               offset,
+               create_immed(b, 0),
+       }, 2);
+
+       switch (intr->intrinsic) {
+       case nir_intrinsic_ssbo_atomic_add:
+               atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_ssbo_atomic_imin:
+               atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               type = TYPE_S32;
+               break;
+       case nir_intrinsic_ssbo_atomic_umin:
+               atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_ssbo_atomic_imax:
+               atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               type = TYPE_S32;
+               break;
+       case nir_intrinsic_ssbo_atomic_umax:
+               atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_ssbo_atomic_and:
+               atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_ssbo_atomic_or:
+               atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_ssbo_atomic_xor:
+               atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_ssbo_atomic_exchange:
+               atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_ssbo_atomic_comp_swap:
+               /* for cmpxchg, src0 is [ui]vec2(data, compare): */
+               src0 = create_collect(ctx, (struct ir3_instruction*[]){
+                       get_src(ctx, &intr->src[3])[0],
+                       src0,
+               }, 2);
+               atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       default:
+               unreachable("boo");
+       }
+
+       atomic->cat6.iim_val = 1;
+       atomic->cat6.d = 4;
+       atomic->cat6.type = type;
+       atomic->barrier_class = IR3_BARRIER_BUFFER_W;
+       atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+
+       /* even if nothing consumes the result, we can't DCE the instruction: */
+       array_insert(b, b->keeps, atomic);
+
+       return atomic;
+}
+
+/* src[] = { offset }. const_index[] = { base } */
+static void
+emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *ldl, *offset;
+       unsigned base;
+
+       offset = get_src(ctx, &intr->src[0])[0];
+       base   = nir_intrinsic_base(intr);
+
+       ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0);
+       ldl->cat6.src_offset = base;
+       ldl->cat6.type = utype_dst(intr->dest);
+       ldl->regs[0]->wrmask = MASK(intr->num_components);
+
+       ldl->barrier_class = IR3_BARRIER_SHARED_R;
+       ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
+
+       split_dest(b, dst, ldl, 0, intr->num_components);
+}
+
+/* src[] = { value, offset }. const_index[] = { base, write_mask } */
+static void
+emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *stl, *offset;
+       struct ir3_instruction * const *value;
+       unsigned base, wrmask;
+
+       value  = get_src(ctx, &intr->src[0]);
+       offset = get_src(ctx, &intr->src[1])[0];
+
+       base   = nir_intrinsic_base(intr);
+       wrmask = nir_intrinsic_write_mask(intr);
+
+       /* Combine groups of consecutive enabled channels in one write
+        * message. We use ffs to find the first enabled channel and then ffs on
+        * the bit-inverse, down-shifted writemask to determine the length of
+        * the block of enabled bits.
+        *
+        * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
+        */
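+       /* Worked example (illustrative): wrmask = 0b1011 first emits one
+        * stl for components {0,1} (first_component=0, length=2), clears
+        * the written bits, then emits a second stl for component {3}.
+        */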
+       while (wrmask) {
+               unsigned first_component = ffs(wrmask) - 1;
+               unsigned length = ffs(~(wrmask >> first_component)) - 1;
+
+               stl = ir3_STL(b, offset, 0,
+                       create_collect(ctx, &value[first_component], length), 0,
+                       create_immed(b, length), 0);
+               stl->cat6.dst_offset = first_component + base;
+               stl->cat6.type = utype_src(intr->src[0]);
+               stl->barrier_class = IR3_BARRIER_SHARED_W;
+               stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+
+               array_insert(b, b->keeps, stl);
+
+               /* Clear the bits in the writemask that we just wrote, then try
+                * again to see if more channels are left.
+                */
+               wrmask &= (15 << (first_component + length));
+       }
+}
+
+/*
+ * CS shared variable atomic intrinsics
+ *
+ * All of the shared variable atomic memory operations read a value from
+ * memory, compute a new value using one of the operations below, write the
+ * new value to memory, and return the original value read.
+ *
+ * All operations take 2 sources except CompSwap that takes 3. These
+ * sources represent:
+ *
+ * 0: The offset into the shared variable storage region that the atomic
+ *    operation will operate on.
+ * 1: The data parameter to the atomic function (i.e. the value to add
+ *    in shared_atomic_add, etc).
+ * 2: For CompSwap only: the second data parameter.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *atomic, *src0, *src1;
+       type_t type = TYPE_U32;
+
+       src0 = get_src(ctx, &intr->src[0])[0];   /* offset */
+       src1 = get_src(ctx, &intr->src[1])[0];   /* value */
+
+       switch (intr->intrinsic) {
+       case nir_intrinsic_shared_atomic_add:
+               atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_imin:
+               atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+               type = TYPE_S32;
+               break;
+       case nir_intrinsic_shared_atomic_umin:
+               atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_imax:
+               atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+               type = TYPE_S32;
+               break;
+       case nir_intrinsic_shared_atomic_umax:
+               atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_and:
+               atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_or:
+               atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_xor:
+               atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_exchange:
+               atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
+               break;
+       case nir_intrinsic_shared_atomic_comp_swap:
+               /* for cmpxchg, src1 is [ui]vec2(data, compare): */
+               src1 = create_collect(ctx, (struct ir3_instruction*[]){
+                       get_src(ctx, &intr->src[2])[0],
+                       src1,
+               }, 2);
+               atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
+               break;
+       default:
+               unreachable("boo");
+       }
+
+       atomic->cat6.iim_val = 1;
+       atomic->cat6.d = 1;
+       atomic->cat6.type = type;
+       atomic->barrier_class = IR3_BARRIER_SHARED_W;
+       atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
+
+       /* even if nothing consumes the result, we can't DCE the instruction: */
+       array_insert(b, b->keeps, atomic);
+
+       return atomic;
+}
+
+/* Images get mapped into SSBO/image state (for store/atomic) and texture
+ * state block (for load).  To simplify things, invert the image id and
+ * map it from end of state block, ie. image 0 becomes num-1, image 1
+ * becomes num-2, etc.  This potentially avoids needing to re-emit texture
+ * state when switching shaders.
+ *
+ * TODO: is the max # of samplers and SSBOs the same?  This shouldn't be
+ * hard-coded.  Also, since all the gl shader stages (ie. everything but
+ * CS) share the same SSBO/image state block, this might require some
+ * more logic if we supported images in anything other than FS..
+ */
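+/* Example (using the hardcoded max of 16 below): image 0 maps to slot
+ * 15, image 1 to slot 14, and element 2 of an image array at
+ * driver_location 1 maps to slot 16 - (1 + 2) - 1 = 12.
+ */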
+static unsigned
+get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref)
+{
+       unsigned int loc = 0;
+       unsigned inner_size = 1;
+
+       while (deref->deref_type != nir_deref_type_var) {
+               assert(deref->deref_type == nir_deref_type_array);
+               nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
+               assert(const_index);
+
+               /* Walk up to the parent deref: */
+               deref = nir_deref_instr_parent(deref);
+
+               assert(glsl_type_is_array(deref->type));
+               const unsigned array_len = glsl_get_length(deref->type);
+               loc += MIN2(const_index->u32[0], array_len - 1) * inner_size;
+
+               /* Update the inner size */
+               inner_size *= array_len;
+       }
+
+       loc += deref->var->data.driver_location;
+
+       /* TODO: figure out the real limit per generation, and don't hardcode it: */
+       const unsigned max_samplers = 16;
+       return max_samplers - loc - 1;
+}
+
+/* see tex_info() for equivalent logic for texture instructions.. it
+ * would be nice if this could be better unified..
+ */
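+/* For example: image2D yields 2 coords, image2DArray yields 3 (x, y,
+ * array index, with IR3_INSTR_A set), and image3D/imageCube yield 3
+ * with IR3_INSTR_3D set.
+ */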
+static unsigned
+get_image_coords(const nir_variable *var, unsigned *flagsp)
+{
+       const struct glsl_type *type = glsl_without_array(var->type);
+       unsigned coords, flags = 0;
+
+       switch (glsl_get_sampler_dim(type)) {
+       case GLSL_SAMPLER_DIM_1D:
+       case GLSL_SAMPLER_DIM_BUF:
+               coords = 1;
+               break;
+       case GLSL_SAMPLER_DIM_2D:
+       case GLSL_SAMPLER_DIM_RECT:
+       case GLSL_SAMPLER_DIM_EXTERNAL:
+       case GLSL_SAMPLER_DIM_MS:
+               coords = 2;
+               break;
+       case GLSL_SAMPLER_DIM_3D:
+       case GLSL_SAMPLER_DIM_CUBE:
+               flags |= IR3_INSTR_3D;
+               coords = 3;
+               break;
+       default:
+               unreachable("bad sampler dim");
+               return 0;
+       }
+
+       if (glsl_sampler_type_is_array(type)) {
+               /* note: unlike tex_info(), adjust # of coords to include array idx: */
+               coords++;
+               flags |= IR3_INSTR_A;
+       }
+
+       if (flagsp)
+               *flagsp = flags;
+
+       return coords;
+}
+
+static type_t
+get_image_type(const nir_variable *var)
+{
+       switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) {
+       case GLSL_TYPE_UINT:
+               return TYPE_U32;
+       case GLSL_TYPE_INT:
+               return TYPE_S32;
+       case GLSL_TYPE_FLOAT:
+               return TYPE_F32;
+       default:
+               unreachable("bad sampler type.");
+               return 0;
+       }
+}
+
+static struct ir3_instruction *
+get_image_offset(struct ir3_context *ctx, const nir_variable *var,
+               struct ir3_instruction * const *coords, bool byteoff)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *offset;
+       unsigned ncoords = get_image_coords(var, NULL);
+
+       /* to calculate the byte offset (yes, uggg) we need (up to) three
+        * const values to know the bytes per pixel, and y and z stride:
+        */
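+       /* Illustrative values: for a 2D rgba8 image these would be
+        * { 4 (bytes per pixel), y_pitch, z_pitch }, giving
+        *    offset = coords.x * 4 + coords.y * y_pitch
+        */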
+       unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+               ctx->so->const_layout.image_dims.off[var->data.driver_location];
+
+       debug_assert(ctx->so->const_layout.image_dims.mask &
+                       (1 << var->data.driver_location));
+
+       /* offset = coords.x * bytes_per_pixel: */
+       offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0);
+       if (ncoords > 1) {
+               /* offset += coords.y * y_pitch: */
+               offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0,
+                               coords[1], 0, offset, 0);
+       }
+       if (ncoords > 2) {
+               /* offset += coords.z * z_pitch: */
+               offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0,
+                               coords[2], 0, offset, 0);
+       }
+
+       if (!byteoff) {
+               /* Some cases, like atomics, seem to use a dword offset
+                * instead of a byte offset.. the blob just puts an extra
+                * shr.b in there in those cases:
+                */
+               offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+       }
+
+       return create_collect(ctx, (struct ir3_instruction*[]){
+               offset,
+               create_immed(b, 0),
+       }, 2);
+}
+
+/* src[] = { deref, coord, sample_index }. const_index[] = {} */
+static void
+emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       struct ir3_block *b = ctx->block;
+       const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+       struct ir3_instruction *sam;
+       struct ir3_instruction * const *src0 = get_src(ctx, &intr->src[1]);
+       struct ir3_instruction *coords[4];
+       unsigned flags, ncoords = get_image_coords(var, &flags);
+       unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+       type_t type = get_image_type(var);
+
+       /* hmm, this seems a bit odd, but it is what the blob does, and (at
+        * least a5xx) just faults on bogus addresses otherwise:
+        */
+       if (flags & IR3_INSTR_3D) {
+               flags &= ~IR3_INSTR_3D;
+               flags |= IR3_INSTR_A;
+       }
+
+       for (unsigned i = 0; i < ncoords; i++)
+               coords[i] = src0[i];
+
+       if (ncoords == 1)
+               coords[ncoords++] = create_immed(b, 0);
+
+       sam = ir3_SAM(b, OPC_ISAM, type, 0b1111, flags,
+                       tex_idx, tex_idx, create_collect(ctx, coords, ncoords), NULL);
+
+       sam->barrier_class = IR3_BARRIER_IMAGE_R;
+       sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
+
+       split_dest(b, dst, sam, 0, 4);
+}
+
+/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
+static void
+emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+       struct ir3_instruction *stib, *offset;
+       struct ir3_instruction * const *value = get_src(ctx, &intr->src[3]);
+       struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
+       unsigned ncoords = get_image_coords(var, NULL);
+       unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+
+       /* src0 is value
+        * src1 is coords
+        * src2 is 64b byte offset
+        */
+
+       offset = get_image_offset(ctx, var, coords, true);
+
+       /* NOTE: stib seems to take a byte offset, but stgb.typed can be
+        * used too and takes a dword offset.. not quite sure yet why the
+        * blob uses one over the other in various cases.
+        */
+
+       stib = ir3_STIB(b, create_immed(b, tex_idx), 0,
+                       create_collect(ctx, value, 4), 0,
+                       create_collect(ctx, coords, ncoords), 0,
+                       offset, 0);
+       stib->cat6.iim_val = 4;
+       stib->cat6.d = ncoords;
+       stib->cat6.type = get_image_type(var);
+       stib->cat6.typed = true;
+       stib->barrier_class = IR3_BARRIER_IMAGE_W;
+       stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+
+       array_insert(b, b->keeps, stib);
+}
+
+static void
+emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       struct ir3_block *b = ctx->block;
+       const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+       unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
+       struct ir3_instruction *sam, *lod;
+       unsigned flags, ncoords = get_image_coords(var, &flags);
+
+       lod = create_immed(b, 0);
+       sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
+                       tex_idx, tex_idx, lod, NULL);
+
+       /* Array size actually ends up in .w rather than .z. This doesn't
+        * matter for miplevel 0, but for higher mips the value in z is
+        * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
+        * returned, which means that we have to add 1 to it for arrays on
+        * a3xx.
+        *
+        * Note: use a temporary dst and then copy, since the size of the
+        * dst array that is passed in is based on nir's understanding of
+        * the result size, not the hardware's.
+        */
+       struct ir3_instruction *tmp[4];
+
+       split_dest(b, tmp, sam, 0, 4);
+
+       /* The getsize instruction returns the size in bytes instead of
+        * texels for imageBuffer, so we need to divide it by the pixel size
+        * of the image format.
+        *
+        * TODO: This is at least true on a5xx. Check other gens.
+        */
+       enum glsl_sampler_dim dim =
+               glsl_get_sampler_dim(glsl_without_array(var->type));
+       if (dim == GLSL_SAMPLER_DIM_BUF) {
+               /* Since all the possible values the divisor can take are
+                * power-of-two (4, 8, or 16), the division is implemented
+                * as a shift-right.
+                * During shader setup, the log2 of the image format's
+                * bytes-per-pixel should have been emitted in the 2nd slot of
+                * image_dims. See ir3_shader::emit_image_dims().
+                */
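+               /* e.g. (illustrative) an rgba32f imageBuffer has 16 bytes
+                * per texel, so the 2nd slot holds log2(16) = 4 and the
+                * size is shifted right by 4 below.
+                */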
+               unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+                       ctx->so->const_layout.image_dims.off[var->data.driver_location];
+               struct ir3_instruction *aux = create_uniform(ctx, cb + 1);
+
+               tmp[0] = ir3_SHR_B(b, tmp[0], 0, aux, 0);
+       }
+
+       for (unsigned i = 0; i < ncoords; i++)
+               dst[i] = tmp[i];
+
+       if (flags & IR3_INSTR_A) {
+               if (ctx->compiler->levels_add_one) {
+                       dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
+               } else {
+                       dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32);
+               }
+       }
+}
+
+/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
+static struct ir3_instruction *
+emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+       struct ir3_instruction *atomic, *image, *src0, *src1, *src2;
+       struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
+       unsigned ncoords = get_image_coords(var, NULL);
+
+       image = create_immed(b, get_image_slot(ctx, nir_src_as_deref(intr->src[0])));
+
+       /* src0 is value (or uvec2(value, compare))
+        * src1 is coords
+        * src2 is 64b byte offset
+        */
+       src0 = get_src(ctx, &intr->src[3])[0];
+       src1 = create_collect(ctx, coords, ncoords);
+       src2 = get_image_offset(ctx, var, coords, false);
+
+       switch (intr->intrinsic) {
+       case nir_intrinsic_image_deref_atomic_add:
+               atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_deref_atomic_min:
+               atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_deref_atomic_max:
+               atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_deref_atomic_and:
+               atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_deref_atomic_or:
+               atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_deref_atomic_xor:
+               atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_deref_atomic_exchange:
+               atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_deref_atomic_comp_swap:
+               /* for cmpxchg, src0 is [ui]vec2(data, compare): */
+               src0 = create_collect(ctx, (struct ir3_instruction*[]){
+                       get_src(ctx, &intr->src[4])[0],
+                       src0,
+               }, 2);
+               atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       default:
+               unreachable("boo");
+       }
+
+       atomic->cat6.iim_val = 1;
+       atomic->cat6.d = ncoords;
+       atomic->cat6.type = get_image_type(var);
+       atomic->cat6.typed = true;
+       atomic->barrier_class = IR3_BARRIER_IMAGE_W;
+       atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
+
+       /* even if nothing consumes the result, we can't DCE the instruction: */
+       array_insert(b, b->keeps, atomic);
+
+       return atomic;
+}
+
+static void
+emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *barrier;
+
+       switch (intr->intrinsic) {
+       case nir_intrinsic_barrier:
+               barrier = ir3_BAR(b);
+               barrier->cat7.g = true;
+               barrier->cat7.l = true;
+               barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+               barrier->barrier_class = IR3_BARRIER_EVERYTHING;
+               break;
+       case nir_intrinsic_memory_barrier:
+               barrier = ir3_FENCE(b);
+               barrier->cat7.g = true;
+               barrier->cat7.r = true;
+               barrier->cat7.w = true;
+               barrier->barrier_class = IR3_BARRIER_IMAGE_W |
+                               IR3_BARRIER_BUFFER_W;
+               barrier->barrier_conflict =
+                               IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+                               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+               break;
+       case nir_intrinsic_memory_barrier_atomic_counter:
+       case nir_intrinsic_memory_barrier_buffer:
+               barrier = ir3_FENCE(b);
+               barrier->cat7.g = true;
+               barrier->cat7.r = true;
+               barrier->cat7.w = true;
+               barrier->barrier_class = IR3_BARRIER_BUFFER_W;
+               barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
+                               IR3_BARRIER_BUFFER_W;
+               break;
+       case nir_intrinsic_memory_barrier_image:
+               /* TODO: double check if this should have .g set */
+               barrier = ir3_FENCE(b);
+               barrier->cat7.g = true;
+               barrier->cat7.r = true;
+               barrier->cat7.w = true;
+               barrier->barrier_class = IR3_BARRIER_IMAGE_W;
+               barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
+                               IR3_BARRIER_IMAGE_W;
+               break;
+       case nir_intrinsic_memory_barrier_shared:
+               barrier = ir3_FENCE(b);
+               barrier->cat7.g = true;
+               barrier->cat7.l = true;
+               barrier->cat7.r = true;
+               barrier->cat7.w = true;
+               barrier->barrier_class = IR3_BARRIER_SHARED_W;
+               barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
+                               IR3_BARRIER_SHARED_W;
+               break;
+       case nir_intrinsic_group_memory_barrier:
+               barrier = ir3_FENCE(b);
+               barrier->cat7.g = true;
+               barrier->cat7.l = true;
+               barrier->cat7.r = true;
+               barrier->cat7.w = true;
+               barrier->barrier_class = IR3_BARRIER_SHARED_W |
+                               IR3_BARRIER_IMAGE_W |
+                               IR3_BARRIER_BUFFER_W;
+               barrier->barrier_conflict =
+                               IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
+                               IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
+                               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
+               break;
+       default:
+               unreachable("boo");
+       }
+
+       /* make sure barrier doesn't get DCE'd */
+       array_insert(b, b->keeps, barrier);
+}
+
+static void add_sysval_input_compmask(struct ir3_context *ctx,
+               gl_system_value slot, unsigned compmask,
+               struct ir3_instruction *instr)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       unsigned r = regid(so->inputs_count, 0);
+       unsigned n = so->inputs_count++;
+
+       so->inputs[n].sysval = true;
+       so->inputs[n].slot = slot;
+       so->inputs[n].compmask = compmask;
+       so->inputs[n].regid = r;
+       so->inputs[n].interpolate = INTERP_MODE_FLAT;
+       so->total_in++;
+
+       ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
+       ctx->ir->inputs[r] = instr;
+}
+
+static void add_sysval_input(struct ir3_context *ctx, gl_system_value slot,
+               struct ir3_instruction *instr)
+{
+       add_sysval_input_compmask(ctx, slot, 0x1, instr);
+}
+
+static void
+emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
+       struct ir3_instruction **dst;
+       struct ir3_instruction * const *src;
+       struct ir3_block *b = ctx->block;
+       nir_const_value *const_offset;
+       int idx, comp;
+
+       if (info->has_dest) {
+               unsigned n = nir_intrinsic_dest_components(intr);
+               dst = get_dst(ctx, &intr->dest, n);
+       } else {
+               dst = NULL;
+       }
+
+       switch (intr->intrinsic) {
+       case nir_intrinsic_load_uniform:
+               idx = nir_intrinsic_base(intr);
+               const_offset = nir_src_as_const_value(intr->src[0]);
+               if (const_offset) {
+                       idx += const_offset->u32[0];
+                       for (int i = 0; i < intr->num_components; i++) {
+                               unsigned n = idx * 4 + i;
+                               dst[i] = create_uniform(ctx, n);
+                       }
+               } else {
+                       src = get_src(ctx, &intr->src[0]);
+                       for (int i = 0; i < intr->num_components; i++) {
+                               int n = idx * 4 + i;
+                               dst[i] = create_uniform_indirect(ctx, n,
+                                               get_addr(ctx, src[0], 4));
+                       }
+                       /* NOTE: if relative addressing is used, we set
+                        * constlen in the compiler (to worst-case value)
+                        * since we don't know in the assembler what the max
+                        * addr reg value can be:
+                        */
+                       ctx->so->constlen = ctx->s->num_uniforms;
+               }
+               break;
+       case nir_intrinsic_load_ubo:
+               emit_intrinsic_load_ubo(ctx, intr, dst);
+               break;
+       case nir_intrinsic_load_input:
+               idx = nir_intrinsic_base(intr);
+               comp = nir_intrinsic_component(intr);
+               const_offset = nir_src_as_const_value(intr->src[0]);
+               if (const_offset) {
+                       idx += const_offset->u32[0];
+                       for (int i = 0; i < intr->num_components; i++) {
+                               unsigned n = idx * 4 + i + comp;
+                               dst[i] = ctx->ir->inputs[n];
+                       }
+               } else {
+                       src = get_src(ctx, &intr->src[0]);
+                       struct ir3_instruction *collect =
+                                       create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs);
+                       struct ir3_instruction *addr = get_addr(ctx, src[0], 4);
+                       for (int i = 0; i < intr->num_components; i++) {
+                               unsigned n = idx * 4 + i + comp;
+                               dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+                                               n, addr, collect);
+                       }
+               }
+               break;
+       case nir_intrinsic_load_ssbo:
+               emit_intrinsic_load_ssbo(ctx, intr, dst);
+               break;
+       case nir_intrinsic_store_ssbo:
+               emit_intrinsic_store_ssbo(ctx, intr);
+               break;
+       case nir_intrinsic_get_buffer_size:
+               emit_intrinsic_ssbo_size(ctx, intr, dst);
+               break;
+       case nir_intrinsic_ssbo_atomic_add:
+       case nir_intrinsic_ssbo_atomic_imin:
+       case nir_intrinsic_ssbo_atomic_umin:
+       case nir_intrinsic_ssbo_atomic_imax:
+       case nir_intrinsic_ssbo_atomic_umax:
+       case nir_intrinsic_ssbo_atomic_and:
+       case nir_intrinsic_ssbo_atomic_or:
+       case nir_intrinsic_ssbo_atomic_xor:
+       case nir_intrinsic_ssbo_atomic_exchange:
+       case nir_intrinsic_ssbo_atomic_comp_swap:
+               dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);
+               break;
+       case nir_intrinsic_load_shared:
+               emit_intrinsic_load_shared(ctx, intr, dst);
+               break;
+       case nir_intrinsic_store_shared:
+               emit_intrinsic_store_shared(ctx, intr);
+               break;
+       case nir_intrinsic_shared_atomic_add:
+       case nir_intrinsic_shared_atomic_imin:
+       case nir_intrinsic_shared_atomic_umin:
+       case nir_intrinsic_shared_atomic_imax:
+       case nir_intrinsic_shared_atomic_umax:
+       case nir_intrinsic_shared_atomic_and:
+       case nir_intrinsic_shared_atomic_or:
+       case nir_intrinsic_shared_atomic_xor:
+       case nir_intrinsic_shared_atomic_exchange:
+       case nir_intrinsic_shared_atomic_comp_swap:
+               dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
+               break;
+       case nir_intrinsic_image_deref_load:
+               emit_intrinsic_load_image(ctx, intr, dst);
+               break;
+       case nir_intrinsic_image_deref_store:
+               emit_intrinsic_store_image(ctx, intr);
+               break;
+       case nir_intrinsic_image_deref_size:
+               emit_intrinsic_image_size(ctx, intr, dst);
+               break;
+       case nir_intrinsic_image_deref_atomic_add:
+       case nir_intrinsic_image_deref_atomic_min:
+       case nir_intrinsic_image_deref_atomic_max:
+       case nir_intrinsic_image_deref_atomic_and:
+       case nir_intrinsic_image_deref_atomic_or:
+       case nir_intrinsic_image_deref_atomic_xor:
+       case nir_intrinsic_image_deref_atomic_exchange:
+       case nir_intrinsic_image_deref_atomic_comp_swap:
+               dst[0] = emit_intrinsic_atomic_image(ctx, intr);
+               break;
+       case nir_intrinsic_barrier:
+       case nir_intrinsic_memory_barrier:
+       case nir_intrinsic_group_memory_barrier:
+       case nir_intrinsic_memory_barrier_atomic_counter:
+       case nir_intrinsic_memory_barrier_buffer:
+       case nir_intrinsic_memory_barrier_image:
+       case nir_intrinsic_memory_barrier_shared:
+               emit_intrinsic_barrier(ctx, intr);
+               /* note that the blk ptr is no longer valid, make that obvious: */
+               b = NULL;
+               break;
+       case nir_intrinsic_store_output:
+               idx = nir_intrinsic_base(intr);
+               comp = nir_intrinsic_component(intr);
+               const_offset = nir_src_as_const_value(intr->src[1]);
+               compile_assert(ctx, const_offset != NULL);
+               idx += const_offset->u32[0];
+
+               src = get_src(ctx, &intr->src[0]);
+               for (int i = 0; i < intr->num_components; i++) {
+                       unsigned n = idx * 4 + i + comp;
+                       ctx->ir->outputs[n] = src[i];
+               }
+               break;
+       case nir_intrinsic_load_base_vertex:
+       case nir_intrinsic_load_first_vertex:
+               if (!ctx->basevertex) {
+                       ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
+                       add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex);
+               }
+               dst[0] = ctx->basevertex;
+               break;
+       case nir_intrinsic_load_vertex_id_zero_base:
+       case nir_intrinsic_load_vertex_id:
+               if (!ctx->vertex_id) {
+                       gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
+                               SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
+                       ctx->vertex_id = create_input(ctx, 0);
+                       add_sysval_input(ctx, sv, ctx->vertex_id);
+               }
+               dst[0] = ctx->vertex_id;
+               break;
+       case nir_intrinsic_load_instance_id:
+               if (!ctx->instance_id) {
+                       ctx->instance_id = create_input(ctx, 0);
+                       add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
+                                       ctx->instance_id);
+               }
+               dst[0] = ctx->instance_id;
+               break;
+       case nir_intrinsic_load_sample_id:
+       case nir_intrinsic_load_sample_id_no_per_sample:
+               if (!ctx->samp_id) {
+                       ctx->samp_id = create_input(ctx, 0);
+                       ctx->samp_id->regs[0]->flags |= IR3_REG_HALF;
+                       add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID,
+                                       ctx->samp_id);
+               }
+               dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
+               break;
+       case nir_intrinsic_load_sample_mask_in:
+               if (!ctx->samp_mask_in) {
+                       ctx->samp_mask_in = create_input(ctx, 0);
+                       add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN,
+                                       ctx->samp_mask_in);
+               }
+               dst[0] = ctx->samp_mask_in;
+               break;
+       case nir_intrinsic_load_user_clip_plane:
+               idx = nir_intrinsic_ucp_id(intr);
+               for (int i = 0; i < intr->num_components; i++) {
+                       unsigned n = idx * 4 + i;
+                       dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
+               }
+               break;
+       case nir_intrinsic_load_front_face:
+               if (!ctx->frag_face) {
+                       ctx->so->frag_face = true;
+                       ctx->frag_face = create_input(ctx, 0);
+                       add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face);
+                       ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
+               }
+               /* for fragface, we get -1 for back and 0 for front. However,
+                * this is the inverse of what nir expects (where ~0 is true).
+                */
+               dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
+               dst[0] = ir3_NOT_B(b, dst[0], 0);
+               break;
+       case nir_intrinsic_load_local_invocation_id:
+               if (!ctx->local_invocation_id) {
+                       ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7);
+                       add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID,
+                                       0x7, ctx->local_invocation_id);
+               }
+               split_dest(b, dst, ctx->local_invocation_id, 0, 3);
+               break;
+       case nir_intrinsic_load_work_group_id:
+               if (!ctx->work_group_id) {
+                       ctx->work_group_id = create_input_compmask(ctx, 0, 0x7);
+                       add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID,
+                                       0x7, ctx->work_group_id);
+                       ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH;
+               }
+               split_dest(b, dst, ctx->work_group_id, 0, 3);
+               break;
+       case nir_intrinsic_load_num_work_groups:
+               for (int i = 0; i < intr->num_components; i++) {
+                       dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
+               }
+               break;
+       case nir_intrinsic_load_local_group_size:
+               for (int i = 0; i < intr->num_components; i++) {
+                       dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
+               }
+               break;
+       case nir_intrinsic_discard_if:
+       case nir_intrinsic_discard: {
+               struct ir3_instruction *cond, *kill;
+
+               if (intr->intrinsic == nir_intrinsic_discard_if) {
+                       /* conditional discard: */
+                       src = get_src(ctx, &intr->src[0]);
+                       cond = ir3_b2n(b, src[0]);
+               } else {
+                       /* unconditional discard: */
+                       cond = create_immed(b, 1);
+               }
+
+               /* NOTE: only cmps.*.* can write p0.x: */
+               cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
+               cond->cat2.condition = IR3_COND_NE;
+
+               /* condition always goes in predicate register: */
+               cond->regs[0]->num = regid(REG_P0, 0);
+
+               kill = ir3_KILL(b, cond, 0);
+               array_insert(ctx->ir, ctx->ir->predicates, kill);
+
+               array_insert(b, b->keeps, kill);
+               ctx->so->has_kill = true;
+
+               break;
+       }
+       default:
+               compile_error(ctx, "Unhandled intrinsic type: %s\n",
+                               nir_intrinsic_infos[intr->intrinsic].name);
+               break;
+       }
+
+       if (info->has_dest)
+               put_dst(ctx, &intr->dest);
+}
+
+static void
+emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
+{
+       struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
+                       instr->def.num_components);
+       type_t type = (instr->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;
+
+       for (int i = 0; i < instr->def.num_components; i++)
+               dst[i] = create_immed_typed(ctx->block, instr->value.u32[i], type);
+}
+
+static void
+emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
+{
+       struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
+                       undef->def.num_components);
+       type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;
+
+       /* the backend doesn't want undefined instructions, so just plug
+        * in 0.0..
+        */
+       for (int i = 0; i < undef->def.num_components; i++)
+               dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
+}
+
+/*
+ * texture fetch/sample instructions:
+ */
+
+static void
+tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
+{
+       unsigned coords, flags = 0;
+
+       /* note: we would use tex->coord_components.. except for txs..
+        * also, since the array index goes after the shadow ref, we don't
+        * want to count it here:
+        */
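+       /* e.g. a sampler2DArrayShadow reports coords = 2 here; the
+        * comparator and array index are accounted for via IR3_INSTR_S
+        * and IR3_INSTR_A instead.
+        */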
+       switch (tex->sampler_dim) {
+       case GLSL_SAMPLER_DIM_1D:
+       case GLSL_SAMPLER_DIM_BUF:
+               coords = 1;
+               break;
+       case GLSL_SAMPLER_DIM_2D:
+       case GLSL_SAMPLER_DIM_RECT:
+       case GLSL_SAMPLER_DIM_EXTERNAL:
+       case GLSL_SAMPLER_DIM_MS:
+               coords = 2;
+               break;
+       case GLSL_SAMPLER_DIM_3D:
+       case GLSL_SAMPLER_DIM_CUBE:
+               coords = 3;
+               flags |= IR3_INSTR_3D;
+               break;
+       default:
+               unreachable("bad sampler_dim");
+       }
+
+       if (tex->is_shadow && tex->op != nir_texop_lod)
+               flags |= IR3_INSTR_S;
+
+       if (tex->is_array && tex->op != nir_texop_lod)
+               flags |= IR3_INSTR_A;
+
+       *flagsp = flags;
+       *coordsp = coords;
+}
+
+static void
+emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
+       struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
+       struct ir3_instruction *lod, *compare, *proj, *sample_index;
+       bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
+       unsigned i, coords, flags;
+       unsigned nsrc0 = 0, nsrc1 = 0;
+       type_t type;
+       opc_t opc = 0;
+
+       coord = off = ddx = ddy = NULL;
+       lod = proj = compare = sample_index = NULL;
+
+       /* TODO: might just be one component for gathers? */
+       dst = get_dst(ctx, &tex->dest, 4);
+
+       for (unsigned i = 0; i < tex->num_srcs; i++) {
+               switch (tex->src[i].src_type) {
+               case nir_tex_src_coord:
+                       coord = get_src(ctx, &tex->src[i].src);
+                       break;
+               case nir_tex_src_bias:
+                       lod = get_src(ctx, &tex->src[i].src)[0];
+                       has_bias = true;
+                       break;
+               case nir_tex_src_lod:
+                       lod = get_src(ctx, &tex->src[i].src)[0];
+                       has_lod = true;
+                       break;
+               case nir_tex_src_comparator: /* shadow comparator */
+                       compare = get_src(ctx, &tex->src[i].src)[0];
+                       break;
+               case nir_tex_src_projector:
+                       proj = get_src(ctx, &tex->src[i].src)[0];
+                       has_proj = true;
+                       break;
+               case nir_tex_src_offset:
+                       off = get_src(ctx, &tex->src[i].src);
+                       has_off = true;
+                       break;
+               case nir_tex_src_ddx:
+                       ddx = get_src(ctx, &tex->src[i].src);
+                       break;
+               case nir_tex_src_ddy:
+                       ddy = get_src(ctx, &tex->src[i].src);
+                       break;
+               case nir_tex_src_ms_index:
+                       sample_index = get_src(ctx, &tex->src[i].src)[0];
+                       break;
+               default:
+                       compile_error(ctx, "Unhandled NIR tex src type: %d\n",
+                                       tex->src[i].src_type);
+                       return;
+               }
+       }
+
+       switch (tex->op) {
+       case nir_texop_tex:      opc = has_lod ? OPC_SAML : OPC_SAM; break;
+       case nir_texop_txb:      opc = OPC_SAMB;     break;
+       case nir_texop_txl:      opc = OPC_SAML;     break;
+       case nir_texop_txd:      opc = OPC_SAMGQ;    break;
+       case nir_texop_txf:      opc = OPC_ISAML;    break;
+       case nir_texop_lod:      opc = OPC_GETLOD;   break;
+       case nir_texop_tg4:
+               /* NOTE: a4xx might need to emulate gather w/ txf (this is
+                * what the blob does; gather seems to be broken?), and a3xx
+                * did not support it (but probably could also emulate it).
+                */
+               switch (tex->component) {
+               case 0:              opc = OPC_GATHER4R; break;
+               case 1:              opc = OPC_GATHER4G; break;
+               case 2:              opc = OPC_GATHER4B; break;
+               case 3:              opc = OPC_GATHER4A; break;
+               }
+               break;
+       case nir_texop_txf_ms:   opc = OPC_ISAMM;    break;
+       case nir_texop_txs:
+       case nir_texop_query_levels:
+       case nir_texop_texture_samples:
+       case nir_texop_samples_identical:
+       case nir_texop_txf_ms_mcs:
+               compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
+               return;
+       }
+
+       tex_info(tex, &flags, &coords);
+
+       /*
+        * lay out the first argument in the proper order:
+        *  - actual coordinates first
+        *  - shadow reference
+        *  - array index
+        *  - projection w
+        *  - starting at offset 4, dpdx.xy, dpdy.xy
+        *
+        * bias/lod go into the second arg
+        */
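+       /* Illustrative example: a sampler2DArrayShadow lookup ends up
+        * with src0 = { x, y, compare, array_idx }, and with a bias or
+        * lod, src1 = { lod }.
+        */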
+
+       /* insert tex coords: */
+       for (i = 0; i < coords; i++)
+               src0[i] = coord[i];
+
+       nsrc0 = i;
+
+       /* NOTE: a3xx (and possibly a4xx?) might be different, using isaml
+        * with the x coord scaled according to the requested sample:
+        */
+       if (tex->op == nir_texop_txf_ms) {
+               if (ctx->compiler->txf_ms_with_isaml) {
+                       /* the samples are laid out in x dimension as
+                        *     0 1 2 3
+                        * x_ms = (x << ms) + sample_index;
+                        */
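+                       /* e.g. (illustrative) with 4x MSAA (ms = 2), texel
+                        * x = 5 at sample_index = 2 becomes
+                        * x_ms = (5 << 2) + 2 = 22.
+                        */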
+                       struct ir3_instruction *ms;
+                       ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
+
+                       src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
+                       src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
+
+                       opc = OPC_ISAML;
+               } else {
+                       src0[nsrc0++] = sample_index;
+               }
+       }
+
+       /* scale up integer coords for TXF based on the LOD */
+       if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
+               assert(has_lod);
+               for (i = 0; i < coords; i++)
+                       src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
+       }
+
+       if (coords == 1) {
+               /* the hw doesn't do 1d, so we treat it as 2d with a
+                * height of 1, and patch up the y coord.
+                * TODO: the y coord should be (int)0 in some cases..
+                */
+               src0[nsrc0++] = create_immed(b, fui(0.5));
+       }
+
+       if (tex->is_shadow && tex->op != nir_texop_lod)
+               src0[nsrc0++] = compare;
+
+       if (tex->is_array && tex->op != nir_texop_lod) {
+               struct ir3_instruction *idx = coord[coords];
+
+               /* the array coord for cube arrays needs 0.5 added to it */
+               if (ctx->compiler->array_index_add_half && (opc != OPC_ISAML))
+                       idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
+
+               src0[nsrc0++] = idx;
+       }
+
+       if (has_proj) {
+               src0[nsrc0++] = proj;
+               flags |= IR3_INSTR_P;
+       }
+
+       /* pad to 4, then ddx/ddy: */
+       if (tex->op == nir_texop_txd) {
+               while (nsrc0 < 4)
+                       src0[nsrc0++] = create_immed(b, fui(0.0));
+               for (i = 0; i < coords; i++)
+                       src0[nsrc0++] = ddx[i];
+               if (coords < 2)
+                       src0[nsrc0++] = create_immed(b, fui(0.0));
+               for (i = 0; i < coords; i++)
+                       src0[nsrc0++] = ddy[i];
+               if (coords < 2)
+                       src0[nsrc0++] = create_immed(b, fui(0.0));
+       }
+
+       /*
+        * second argument (if applicable):
+        *  - offsets
+        *  - lod
+        *  - bias
+        */
+       if (has_off | has_lod | has_bias) {
+               if (has_off) {
+                       unsigned off_coords = coords;
+                       if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+                               off_coords--;
+                       for (i = 0; i < off_coords; i++)
+                               src1[nsrc1++] = off[i];
+                       if (off_coords < 2)
+                               src1[nsrc1++] = create_immed(b, fui(0.0));
+                       flags |= IR3_INSTR_O;
+               }
+
+               if (has_lod | has_bias)
+                       src1[nsrc1++] = lod;
+       }
+
+       switch (tex->dest_type) {
+       case nir_type_invalid:
+       case nir_type_float:
+               type = TYPE_F32;
+               break;
+       case nir_type_int:
+               type = TYPE_S32;
+               break;
+       case nir_type_uint:
+       case nir_type_bool:
+               type = TYPE_U32;
+               break;
+       default:
+               unreachable("bad dest_type");
+       }
+
+       if (opc == OPC_GETLOD)
+               type = TYPE_U32;
+
+       unsigned tex_idx = tex->texture_index;
+
+       ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx);
+
+       struct ir3_instruction *col0 = create_collect(ctx, src0, nsrc0);
+       struct ir3_instruction *col1 = create_collect(ctx, src1, nsrc1);
+
+       sam = ir3_SAM(b, opc, type, 0b1111, flags,
+                       tex_idx, tex_idx, col0, col1);
+
+       if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) {
+               /* only need first 3 components: */
+               sam->regs[0]->wrmask = 0x7;
+               split_dest(b, dst, sam, 0, 3);
+
+               /* we need to sample the alpha separately with a non-ASTC
+                * texture state:
+                */
+               sam = ir3_SAM(b, opc, type, 0b1000, flags,
+                               tex_idx, tex_idx, col0, col1);
+
+               array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
+
+               /* fixup .w component: */
+               split_dest(b, &dst[3], sam, 3, 1);
+       } else {
+               /* normal (non-workaround) case: */
+               split_dest(b, dst, sam, 0, 4);
+       }
+
+       /* GETLOD returns results in 4.8 fixed point */
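+       /* (e.g. a raw getlod result of 384 means 384 / 256 = 1.5) */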
+       if (opc == OPC_GETLOD) {
+               struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
+
+               compile_assert(ctx, tex->dest_type == nir_type_float);
+               for (i = 0; i < 2; i++) {
+                       dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
+                                                          factor, 0);
+               }
+       }
+
+       put_dst(ctx, &tex->dest);
+}
+
+static void
+emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction **dst, *sam;
+
+       dst = get_dst(ctx, &tex->dest, 1);
+
+       sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, 0b0100, 0,
+                       tex->texture_index, tex->texture_index, NULL, NULL);
+
+       /* even though there is only one component, since it ends
+        * up in .z rather than .x, we need a split_dest()
+        */
+       split_dest(b, dst, sam, 0, 3);
+
+       /* The # of levels comes from getinfo.z. We need to add 1 to it, since
+        * the value in TEX_CONST_0 is zero-based.
+        */
+       if (ctx->compiler->levels_add_one)
+               dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
+
+       put_dst(ctx, &tex->dest);
+}
+
+static void
+emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction **dst, *sam;
+       struct ir3_instruction *lod;
+       unsigned flags, coords;
+
+       tex_info(tex, &flags, &coords);
+
+       /* Actually we want the number of dimensions, not coordinates. This
+        * distinction only matters for cubes.
+        */
+       if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+               coords = 2;
+
+       dst = get_dst(ctx, &tex->dest, 4);
+
+       compile_assert(ctx, tex->num_srcs == 1);
+       compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
+
+       lod = get_src(ctx, &tex->src[0].src)[0];
+
+       sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
+                       tex->texture_index, tex->texture_index, lod, NULL);
+
+       split_dest(b, dst, sam, 0, 4);
+
+       /* Array size actually ends up in .w rather than .z. This doesn't
+        * matter for miplevel 0, but for higher mips the value in z is
+        * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
+        * returned, which means that we have to add 1 to it for arrays.
+        */
+       if (tex->is_array) {
+               if (ctx->compiler->levels_add_one) {
+                       dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
+               } else {
+                       dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
+               }
+       }
+
+       put_dst(ctx, &tex->dest);
+}
+
+static void
+emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
+{
+       switch (jump->type) {
+       case nir_jump_break:
+       case nir_jump_continue:
+       case nir_jump_return:
+               /* I *think* we can simply ignore this, and use the
+                * successor block link to figure out where we need to
+                * jump to for break/continue
+                */
+               break;
+       default:
+               compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+               break;
+       }
+}
+
+static void
+emit_instr(struct ir3_context *ctx, nir_instr *instr)
+{
+       switch (instr->type) {
+       case nir_instr_type_alu:
+               emit_alu(ctx, nir_instr_as_alu(instr));
+               break;
+       case nir_instr_type_deref:
+               /* ignored, handled as part of the intrinsic they are src to */
+               break;
+       case nir_instr_type_intrinsic:
+               emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+               break;
+       case nir_instr_type_load_const:
+               emit_load_const(ctx, nir_instr_as_load_const(instr));
+               break;
+       case nir_instr_type_ssa_undef:
+               emit_undef(ctx, nir_instr_as_ssa_undef(instr));
+               break;
+       case nir_instr_type_tex: {
+               nir_tex_instr *tex = nir_instr_as_tex(instr);
+               /* a couple of tex instructions get special-cased:
+                */
+               switch (tex->op) {
+               case nir_texop_txs:
+                       emit_tex_txs(ctx, tex);
+                       break;
+               case nir_texop_query_levels:
+                       emit_tex_query_levels(ctx, tex);
+                       break;
+               default:
+                       emit_tex(ctx, tex);
+                       break;
+               }
+               break;
+       }
+       case nir_instr_type_jump:
+               emit_jump(ctx, nir_instr_as_jump(instr));
+               break;
+       case nir_instr_type_phi:
+               /* we have converted phi webs to regs in NIR by now */
+               compile_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type);
+               break;
+       case nir_instr_type_call:
+       case nir_instr_type_parallel_copy:
+               compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
+               break;
+       }
+}
+
+static struct ir3_block *
+get_block(struct ir3_context *ctx, const nir_block *nblock)
+{
+       struct ir3_block *block;
+       struct hash_entry *hentry;
+       unsigned i;
+
+       hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
+       if (hentry)
+               return hentry->data;
+
+       block = ir3_block_create(ctx->ir);
+       block->nblock = nblock;
+       _mesa_hash_table_insert(ctx->block_ht, nblock, block);
+
+       block->predecessors_count = nblock->predecessors->entries;
+       block->predecessors = ralloc_array_size(block,
+               sizeof(block->predecessors[0]), block->predecessors_count);
+       i = 0;
+       set_foreach(nblock->predecessors, sentry) {
+               block->predecessors[i++] = get_block(ctx, sentry->key);
+       }
+
+       return block;
+}
+
+static void
+emit_block(struct ir3_context *ctx, nir_block *nblock)
+{
+       struct ir3_block *block = get_block(ctx, nblock);
+
+       for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
+               if (nblock->successors[i]) {
+                       block->successors[i] =
+                               get_block(ctx, nblock->successors[i]);
+               }
+       }
+
+       ctx->block = block;
+       list_addtail(&block->node, &ctx->ir->block_list);
+
+       /* re-emit addr register in each block if needed: */
+       for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) {
+               _mesa_hash_table_destroy(ctx->addr_ht[i], NULL);
+               ctx->addr_ht[i] = NULL;
+       }
+
+       nir_foreach_instr(instr, nblock) {
+               ctx->cur_instr = instr;
+               emit_instr(ctx, instr);
+               ctx->cur_instr = NULL;
+               if (ctx->error)
+                       return;
+       }
+}
+
+static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
+
+static void
+emit_if(struct ir3_context *ctx, nir_if *nif)
+{
+       struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
+
+       ctx->block->condition =
+               get_predicate(ctx, ir3_b2n(condition->block, condition));
+
+       emit_cf_list(ctx, &nif->then_list);
+       emit_cf_list(ctx, &nif->else_list);
+}
+
+static void
+emit_loop(struct ir3_context *ctx, nir_loop *nloop)
+{
+       emit_cf_list(ctx, &nloop->body);
+}
+
+static void
+emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
+{
+       foreach_list_typed(nir_cf_node, node, node, list) {
+               switch (node->type) {
+               case nir_cf_node_block:
+                       emit_block(ctx, nir_cf_node_as_block(node));
+                       break;
+               case nir_cf_node_if:
+                       emit_if(ctx, nir_cf_node_as_if(node));
+                       break;
+               case nir_cf_node_loop:
+                       emit_loop(ctx, nir_cf_node_as_loop(node));
+                       break;
+               case nir_cf_node_function:
+                       compile_error(ctx, "TODO\n");
+                       break;
+               }
+       }
+}
+
+/* emit stream-out code.  At this point, the current block is the original
+ * (nir) end block, and nir ensures that all flow control paths terminate
+ * into the end block.  We re-purpose the original end block to generate
+ * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
+ * block holding stream-out write instructions, followed by the new end
+ * block:
+ *
+ *   blockOrigEnd {
+ *      p0.x = (vtxcnt < maxvtxcnt)
+ *      // succs: blockStreamOut, blockNewEnd
+ *   }
+ *   blockStreamOut {
+ *      ... stream-out instructions ...
+ *      // succs: blockNewEnd
+ *   }
+ *   blockNewEnd {
+ *   }
+ */
+static void
+emit_stream_out(struct ir3_context *ctx)
+{
+       struct ir3_shader_variant *v = ctx->so;
+       struct ir3 *ir = ctx->ir;
+       struct ir3_stream_output_info *strmout =
+                       &ctx->so->shader->stream_output;
+       struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
+       struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
+       struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
+
+       /* create vtxcnt input in input block at top of shader,
+        * so that it is seen as live over the entire duration
+        * of the shader:
+        */
+       vtxcnt = create_input(ctx, 0);
+       add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);
+
+       maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
+
+       /* at this point, we are at the original 'end' block; re-purpose
+        * this block for the stream-out condition, then append the
+        * stream-out block and the new-end block
+        */
+       orig_end_block = ctx->block;
+
+       /* TODO: these blocks need to update predecessors.. maybe w/ a
+        * store_global intrinsic, we could do this stuff in a nir->nir
+        * pass:
+        */
+
+       stream_out_block = ir3_block_create(ir);
+       list_addtail(&stream_out_block->node, &ir->block_list);
+
+       new_end_block = ir3_block_create(ir);
+       list_addtail(&new_end_block->node, &ir->block_list);
+
+       orig_end_block->successors[0] = stream_out_block;
+       orig_end_block->successors[1] = new_end_block;
+       stream_out_block->successors[0] = new_end_block;
+
+       /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
+       cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
+       cond->regs[0]->num = regid(REG_P0, 0);
+       cond->cat2.condition = IR3_COND_LT;
+
+       /* condition goes on previous block to the conditional,
+        * since it is used to pick which of the two successor
+        * paths to take:
+        */
+       orig_end_block->condition = cond;
+
+       /* switch to stream_out_block to generate the stream-out
+        * instructions:
+        */
+       ctx->block = stream_out_block;
+
+       /* Calculate base addresses based on vtxcnt.  Instructions
+        * generated for bases not used in the following loop will be
+        * stripped out in the backend.
+        */
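+       /* i.e. (in bytes): bases[i] = tfbo[i] + vtxcnt * stride[i] * 4 */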
+       for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+               unsigned stride = strmout->stride[i];
+               struct ir3_instruction *base, *off;
+
+               base = create_uniform(ctx, regid(v->constbase.tfbo, i));
+
+               /* 24-bit should be enough: */
+               off = ir3_MUL_U(ctx->block, vtxcnt, 0,
+                               create_immed(ctx->block, stride * 4), 0);
+
+               bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
+       }
+
+       /* Generate the per-output store instructions: */
+       for (unsigned i = 0; i < strmout->num_outputs; i++) {
+               for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
+                       unsigned c = j + strmout->output[i].start_component;
+                       struct ir3_instruction *base, *out, *stg;
+
+                       base = bases[strmout->output[i].output_buffer];
+                       out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
+
+                       stg = ir3_STG(ctx->block, base, 0, out, 0,
+                                       create_immed(ctx->block, 1), 0);
+                       stg->cat6.type = TYPE_U32;
+                       stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
+
+                       array_insert(ctx->block, ctx->block->keeps, stg);
+               }
+       }
+
+       /* and finally switch to the new_end_block: */
+       ctx->block = new_end_block;
+}
+
+static void
+emit_function(struct ir3_context *ctx, nir_function_impl *impl)
+{
+       nir_metadata_require(impl, nir_metadata_block_index);
+
+       emit_cf_list(ctx, &impl->body);
+       emit_block(ctx, impl->end_block);
+
+       /* at this point, we should have a single empty block,
+        * into which we emit the 'end' instruction.
+        */
+       compile_assert(ctx, list_empty(&ctx->block->instr_list));
+
+       /* If stream-out (aka transform-feedback) is enabled, emit the
+        * stream-out instructions, followed by a new empty block (into
+        * which the 'end' instruction lands).
+        *
+        * NOTE: it is done in this order, rather than inserting before
+        * we emit end_block, because NIR guarantees that all blocks
+        * flow into end_block, and that end_block has no successors.
+        * So by re-purposing end_block as the first block of stream-
+        * out, we guarantee that all exit paths flow into the stream-
+        * out instructions.
+        */
+       if ((ctx->compiler->gpu_id < 500) &&
+                       (ctx->so->shader->stream_output.num_outputs > 0) &&
+                       !ctx->so->binning_pass) {
+               debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
+               emit_stream_out(ctx);
+       }
+
+       ir3_END(ctx->block);
+}
+
+static struct ir3_instruction *
+create_frag_coord(struct ir3_context *ctx, unsigned comp)
+{
+       struct ir3_block *block = ctx->block;
+       struct ir3_instruction *instr;
+
+       if (!ctx->frag_coord) {
+               ctx->frag_coord = create_input_compmask(ctx, 0, 0xf);
+               /* defer add_sysval_input() until after all inputs created */
+       }
+
+       split_dest(block, &instr, ctx->frag_coord, comp, 1);
+
+       switch (comp) {
+       case 0: /* .x */
+       case 1: /* .y */
+               /* for frag_coord, we get unsigned values.. we need
+                * to subtract (integer) 8 and divide by 16 (right-
+                * shift by 4) then convert to float:
+                *
+                *    sub.s tmp, src, 8
+                *    shr.b tmp, tmp, 4
+                *    mov.u32f32 dst, tmp
+                *
+                */
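+               /* (worked example, assuming the hw supplies coords in
+                * 1/16-pixel units with a half-pixel bias: a raw .x of
+                * 72 for pixel 4 becomes (72 - 8) >> 4 = 4, which the
+                * cov then converts to 4.0f)
+                */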
+               instr = ir3_SUB_S(block, instr, 0,
+                               create_immed(block, 8), 0);
+               instr = ir3_SHR_B(block, instr, 0,
+                               create_immed(block, 4), 0);
+               instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
+
+               return instr;
+       case 2: /* .z */
+       case 3: /* .w */
+       default:
+               /* seems that we can use these as-is: */
+               return instr;
+       }
+}
+
+static void
+setup_input(struct ir3_context *ctx, nir_variable *in)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       unsigned ncomp = glsl_get_components(in->type);
+       unsigned n = in->data.driver_location;
+       unsigned slot = in->data.location;
+
+       /* let's pretend things other than vec4 don't exist: */
+       ncomp = MAX2(ncomp, 4);
+
+       /* skip unread inputs; we could end up with (for example) unsplit
+        * matrix/etc inputs in the case they are not read, so just silently
+        * skip these.
+        */
+       if (ncomp > 4)
+               return;
+
+       compile_assert(ctx, ncomp == 4);
+
+       so->inputs[n].slot = slot;
+       so->inputs[n].compmask = (1 << ncomp) - 1;
+       so->inputs_count = MAX2(so->inputs_count, n + 1);
+       so->inputs[n].interpolate = in->data.interpolation;
+
+       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+               for (int i = 0; i < ncomp; i++) {
+                       struct ir3_instruction *instr = NULL;
+                       unsigned idx = (n * 4) + i;
+
+                       if (slot == VARYING_SLOT_POS) {
+                               so->inputs[n].bary = false;
+                               so->frag_coord = true;
+                               instr = create_frag_coord(ctx, i);
+                       } else if (slot == VARYING_SLOT_PNTC) {
+                               /* see for example st_nir_fixup_varying_slots().. this is
+                                * maybe a bit mesa/st specific.  But we need things to line
+                                * up for this in fdN_program:
+                                *    unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
+                                *    if (emit->sprite_coord_enable & texmask) {
+                                *       ...
+                                *    }
+                                */
+                               so->inputs[n].slot = VARYING_SLOT_VAR8;
+                               so->inputs[n].bary = true;
+                               instr = create_frag_input(ctx, false);
+                       } else {
+                               bool use_ldlv = false;
+
+                               /* detect the special case for front/back colors where
+                                * we need to do flat vs smooth shading depending on
+                                * rast state:
+                                */
+                               if (in->data.interpolation == INTERP_MODE_NONE) {
+                                       switch (slot) {
+                                       case VARYING_SLOT_COL0:
+                                       case VARYING_SLOT_COL1:
+                                       case VARYING_SLOT_BFC0:
+                                       case VARYING_SLOT_BFC1:
+                                               so->inputs[n].rasterflat = true;
+                                               break;
+                                       default:
+                                               break;
+                                       }
+                               }
+
+                               if (ctx->compiler->flat_bypass) {
+                                       if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
+                                                       (so->inputs[n].rasterflat && ctx->so->key.rasterflat))
+                                               use_ldlv = true;
+                               }
+
+                               so->inputs[n].bary = true;
+
+                               instr = create_frag_input(ctx, use_ldlv);
+                       }
+
+                       compile_assert(ctx, idx < ctx->ir->ninputs);
+
+                       ctx->ir->inputs[idx] = instr;
+               }
+       } else if (ctx->so->type == MESA_SHADER_VERTEX) {
+               for (int i = 0; i < ncomp; i++) {
+                       unsigned idx = (n * 4) + i;
+                       compile_assert(ctx, idx < ctx->ir->ninputs);
+                       ctx->ir->inputs[idx] = create_input(ctx, idx);
+               }
+       } else {
+               compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+       }
+
+       if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) {
+               so->total_in += ncomp;
+       }
+}
+
+static void
+setup_output(struct ir3_context *ctx, nir_variable *out)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       unsigned ncomp = glsl_get_components(out->type);
+       unsigned n = out->data.driver_location;
+       unsigned slot = out->data.location;
+       unsigned comp = 0;
+
+       /* let's pretend things other than vec4 don't exist: */
+       ncomp = MAX2(ncomp, 4);
+       compile_assert(ctx, ncomp == 4);
+
+       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+               switch (slot) {
+               case FRAG_RESULT_DEPTH:
+                       comp = 2;  /* tgsi will write to .z component */
+                       so->writes_pos = true;
+                       break;
+               case FRAG_RESULT_COLOR:
+                       so->color0_mrt = 1;
+                       break;
+               default:
+                       if (slot >= FRAG_RESULT_DATA0)
+                               break;
+                       compile_error(ctx, "unknown FS output name: %s\n",
+                                       gl_frag_result_name(slot));
+               }
+       } else if (ctx->so->type == MESA_SHADER_VERTEX) {
+               switch (slot) {
+               case VARYING_SLOT_POS:
+                       so->writes_pos = true;
+                       break;
+               case VARYING_SLOT_PSIZ:
+                       so->writes_psize = true;
+                       break;
+               case VARYING_SLOT_COL0:
+               case VARYING_SLOT_COL1:
+               case VARYING_SLOT_BFC0:
+               case VARYING_SLOT_BFC1:
+               case VARYING_SLOT_FOGC:
+               case VARYING_SLOT_CLIP_DIST0:
+               case VARYING_SLOT_CLIP_DIST1:
+               case VARYING_SLOT_CLIP_VERTEX:
+                       break;
+               default:
+                       if (slot >= VARYING_SLOT_VAR0)
+                               break;
+                       if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
+                               break;
+                       compile_error(ctx, "unknown VS output name: %s\n",
+                                       gl_varying_slot_name(slot));
+               }
+       } else {
+               compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+       }
+
+       compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
+
+       so->outputs[n].slot = slot;
+       so->outputs[n].regid = regid(n, comp);
+       so->outputs_count = MAX2(so->outputs_count, n + 1);
+
+       for (int i = 0; i < ncomp; i++) {
+               unsigned idx = (n * 4) + i;
+               compile_assert(ctx, idx < ctx->ir->noutputs);
+               ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
+       }
+}
+
+static int
+max_drvloc(struct exec_list *vars)
+{
+       int drvloc = -1;
+       nir_foreach_variable(var, vars) {
+               drvloc = MAX2(drvloc, (int)var->data.driver_location);
+       }
+       return drvloc;
+}
+
+static const unsigned max_sysvals[] = {
+       [MESA_SHADER_FRAGMENT] = 24,  // TODO
+       [MESA_SHADER_VERTEX]  = 16,
+       [MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need?
+};
+
+static void
+emit_instructions(struct ir3_context *ctx)
+{
+       unsigned ninputs, noutputs;
+       nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
+
+       ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
+       noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
+
+       /* we need to leave room for sysvals:
+        */
+       ninputs += max_sysvals[ctx->so->type];
+
+       ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
+
+       /* Create inputs in first block: */
+       ctx->block = get_block(ctx, nir_start_block(fxn));
+       ctx->in_block = ctx->block;
+       list_addtail(&ctx->block->node, &ctx->ir->block_list);
+
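+       /* the sysval padding above was only needed for sizing the ir3;
+        * trim it back off so the count reflects just the NIR-level
+        * inputs (sysvals are appended separately via add_sysval_input()):
+        */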
+       ninputs -= max_sysvals[ctx->so->type];
+
+       /* for fragment shader, the vcoord input register is used as the
+        * base for bary.f varying fetch instrs:
+        */
+       struct ir3_instruction *vcoord = NULL;
+       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
+               struct ir3_instruction *xy[2];
+
+               vcoord = create_input_compmask(ctx, 0, 0x3);
+               split_dest(ctx->block, xy, vcoord, 0, 2);
+
+               ctx->frag_vcoord = create_collect(ctx, xy, 2);
+       }
+
+       /* Setup inputs: */
+       nir_foreach_variable(var, &ctx->s->inputs) {
+               setup_input(ctx, var);
+       }
+
+       /* Defer add_sysval_input() stuff until after setup_input(),
+        * because sysvals need to be appended after varyings:
+        */
+       if (vcoord) {
+               add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD,
+                               0x3, vcoord);
+       }
+
+       if (ctx->frag_coord) {
+               add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD,
+                               0xf, ctx->frag_coord);
+       }
+
+       /* Setup outputs: */
+       nir_foreach_variable(var, &ctx->s->outputs) {
+               setup_output(ctx, var);
+       }
+
+       /* Setup registers (which should only be arrays): */
+       nir_foreach_register(reg, &ctx->s->registers) {
+               declare_array(ctx, reg);
+       }
+
+       /* NOTE: need to do something more clever when we support >1 fxn */
+       nir_foreach_register(reg, &fxn->registers) {
+               declare_array(ctx, reg);
+       }
+       /* And emit the body: */
+       ctx->impl = fxn;
+       emit_function(ctx, fxn);
+}
+
+/* from NIR perspective, we actually have varying inputs.  But the varying
+ * inputs, from an IR standpoint, are just bary.f/ldlv instructions.  The
+ * only actual inputs are the sysvals.
+ */
+static void
+fixup_frag_inputs(struct ir3_context *ctx)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       struct ir3 *ir = ctx->ir;
+       unsigned i = 0;
+
+       /* sysvals should appear at the end of the inputs, drop everything else: */
+       while ((i < so->inputs_count) && !so->inputs[i].sysval)
+               i++;
+
+       /* at IR level, inputs are always blocks of 4 scalars: */
+       i *= 4;
+
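+       /* eg. with two varying input slots ahead of the sysvals, i
+        * ends up as 8 and we keep only ir->inputs[8..] (illustrative)
+        */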
+       ir->inputs = &ir->inputs[i];
+       ir->ninputs -= i;
+}
+
+/* Fixup tex sampler state for astc/srgb workaround instructions.  We
+ * need to assign the tex state indexes for these after we know the
+ * max tex index.
+ */
+static void
+fixup_astc_srgb(struct ir3_context *ctx)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       /* indexed by original tex idx, value is newly assigned alpha sampler
+        * state tex idx.  Zero is invalid since there is at least one sampler
+        * if we get here.
+        */
+       unsigned alt_tex_state[16] = {0};
+       unsigned tex_idx = ctx->max_texture_index + 1;
+       unsigned idx = 0;
+
+       so->astc_srgb.base = tex_idx;
+
+       for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
+               struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
+
+               compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
+
+               if (alt_tex_state[sam->cat5.tex] == 0) {
+                       /* assign new alternate/alpha tex state slot: */
+                       alt_tex_state[sam->cat5.tex] = tex_idx++;
+                       so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
+                       so->astc_srgb.count++;
+               }
+
+               sam->cat5.tex = alt_tex_state[sam->cat5.tex];
+       }
+}
+
+static void
+fixup_binning_pass(struct ir3_context *ctx)
+{
+       struct ir3_shader_variant *so = ctx->so;
+       struct ir3 *ir = ctx->ir;
+       unsigned i, j;
+
+       for (i = 0, j = 0; i < so->outputs_count; i++) {
+               unsigned slot = so->outputs[i].slot;
+
+               /* throw away everything but first position/psize */
+               if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
+                       if (i != j) {
+                               so->outputs[j] = so->outputs[i];
+                               ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
+                               ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
+                               ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
+                               ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
+                       }
+                       j++;
+               }
+       }
+       so->outputs_count = j;
+       ir->noutputs = j * 4;
+}
+
+int
+ir3_compile_shader_nir(struct ir3_compiler *compiler,
+               struct ir3_shader_variant *so)
+{
+       struct ir3_context *ctx;
+       struct ir3 *ir;
+       struct ir3_instruction **inputs;
+       unsigned i, actual_in, inloc;
+       int ret = 0, max_bary;
+
+       assert(!so->ir);
+
+       ctx = compile_init(compiler, so);
+       if (!ctx) {
+               DBG("INIT failed!");
+               ret = -1;
+               goto out;
+       }
+
+       emit_instructions(ctx);
+
+       if (ctx->error) {
+               DBG("EMIT failed!");
+               ret = -1;
+               goto out;
+       }
+
+       ir = so->ir = ctx->ir;
+
+       /* keep track of the inputs from TGSI perspective.. */
+       inputs = ir->inputs;
+
+       /* but fixup actual inputs for frag shader: */
+       if (so->type == MESA_SHADER_FRAGMENT)
+               fixup_frag_inputs(ctx);
+
+       /* at this point, for binning pass, throw away unneeded outputs: */
+       if (so->binning_pass && (ctx->compiler->gpu_id < 600))
+               fixup_binning_pass(ctx);
+
+       /* if we want half-precision outputs, mark the output registers
+        * as half:
+        */
+       if (so->key.half_precision) {
+               for (i = 0; i < ir->noutputs; i++) {
+                       struct ir3_instruction *out = ir->outputs[i];
+
+                       if (!out)
+                               continue;
+
+                       /* if frag shader writes z, that needs to be full precision: */
+                       if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH)
+                               continue;
+
+                       out->regs[0]->flags |= IR3_REG_HALF;
+                       /* output could be a fanout (ie. texture fetch output)
+                        * in which case we need to propagate the half-reg flag
+                        * up to the definer so that RA sees it:
+                        */
+                       if (out->opc == OPC_META_FO) {
+                               out = out->regs[1]->instr;
+                               out->regs[0]->flags |= IR3_REG_HALF;
+                       }
+
+                       if (out->opc == OPC_MOV) {
+                               out->cat1.dst_type = half_type(out->cat1.dst_type);
+                       }
+               }
+       }
+
+       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+               printf("BEFORE CP:\n");
+               ir3_print(ir);
+       }
+
+       ir3_cp(ir, so);
+
+       /* at this point, for binning pass, throw away unneeded outputs:
+        * Note that for a6xx and later, we do this after ir3_cp to ensure
+        * that the uniform/constant layout for BS and VS matches, so that
+        * we can re-use same VS_CONST state group.
+        */
+       if (so->binning_pass && (ctx->compiler->gpu_id >= 600))
+               fixup_binning_pass(ctx);
+
+       /* Insert a mov if the same instruction is used for multiple
+        * outputs, so that each output gets a distinct instruction, eg.
+        * dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
+        */
+       for (int i = ir->noutputs - 1; i >= 0; i--) {
+               if (!ir->outputs[i])
+                       continue;
+               for (unsigned j = 0; j < i; j++) {
+                       if (ir->outputs[i] == ir->outputs[j]) {
+                               ir->outputs[i] =
+                                       ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32);
+                       }
+               }
+       }
+
+       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+               printf("BEFORE GROUPING:\n");
+               ir3_print(ir);
+       }
+
+       ir3_sched_add_deps(ir);
+
+       /* Group left/right neighbors, inserting mov's where needed to
+        * solve conflicts:
+        */
+       ir3_group(ir);
+
+       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+               printf("AFTER GROUPING:\n");
+               ir3_print(ir);
+       }
+
+       ir3_depth(ir);
+
+       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+               printf("AFTER DEPTH:\n");
+               ir3_print(ir);
+       }
+
+       ret = ir3_sched(ir);
+       if (ret) {
+               DBG("SCHED failed!");
+               goto out;
+       }
+
+       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+               printf("AFTER SCHED:\n");
+               ir3_print(ir);
+       }
+
+       ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
+       if (ret) {
+               DBG("RA failed!");
+               goto out;
+       }
+
+       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+               printf("AFTER RA:\n");
+               ir3_print(ir);
+       }
+
+       /* fixup input/outputs: */
+       for (i = 0; i < so->outputs_count; i++) {
+               so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
+       }
+
+       /* Note that some or all channels of an input may be unused: */
+       actual_in = 0;
+       inloc = 0;
+       for (i = 0; i < so->inputs_count; i++) {
+               unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0;
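+               /* regid(63, 0) appears to be the "unassigned" encoding; it
+                * survives only if no channel of this input is actually used
+                */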
+               so->inputs[i].ncomp = 0;
+               so->inputs[i].inloc = inloc;
+               for (j = 0; j < 4; j++) {
+                       struct ir3_instruction *in = inputs[(i*4) + j];
+                       if (in && !(in->flags & IR3_INSTR_UNUSED)) {
+                               compmask |= (1 << j);
+                               reg = in->regs[0]->num - j;
+                               actual_in++;
+                               so->inputs[i].ncomp++;
+                               if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) {
+                                       /* assign inloc: */
+                                       assert(in->regs[1]->flags & IR3_REG_IMMED);
+                                       in->regs[1]->iim_val = inloc + j;
+                                       maxcomp = j + 1;
+                               }
+                       }
+               }
+               if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
+                       so->varying_in++;
+                       so->inputs[i].compmask = (1 << maxcomp) - 1;
+                       inloc += maxcomp;
+               } else if (!so->inputs[i].sysval) {
+                       so->inputs[i].compmask = compmask;
+               }
+               so->inputs[i].regid = reg;
+       }
+
+       if (ctx->astc_srgb)
+               fixup_astc_srgb(ctx);
+
+       /* We need to run legalize after the "bary.f" offsets (inloc)
+        * have been assigned (in the frag shader case).
+        */
+       ir3_legalize(ir, &so->num_samp, &so->has_ssbo, &max_bary);
+
+       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+               printf("AFTER LEGALIZE:\n");
+               ir3_print(ir);
+       }
+
+       /* Note that actual_in counts inputs that are not bary.f'd for FS: */
+       if (so->type == MESA_SHADER_VERTEX)
+               so->total_in = actual_in;
+       else
+               so->total_in = max_bary + 1;
+
+out:
+       if (ret) {
+               if (so->ir)
+                       ir3_destroy(so->ir);
+               so->ir = NULL;
+       }
+       compile_free(ctx);
+
+       return ret;
+}
diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c
new file mode 100644 (file)
index 0000000..e8e8cc3
--- /dev/null
@@ -0,0 +1,653 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <math.h>
+
+#include "ir3.h"
+#include "ir3_shader.h"
+
+/*
+ * Copy Propagate:
+ */
+
+struct ir3_cp_ctx {
+       struct ir3 *shader;
+       struct ir3_shader_variant *so;
+       unsigned immediate_idx;
+};
+
+/* is it a type preserving mov, with ok flags? */
+static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
+{
+       if (is_same_type_mov(instr)) {
+               struct ir3_register *dst = instr->regs[0];
+               struct ir3_register *src = instr->regs[1];
+               struct ir3_instruction *src_instr = ssa(src);
+
+               /* only if mov src is SSA (not const/immed): */
+               if (!src_instr)
+                       return false;
+
+               /* no indirect: */
+               if (dst->flags & IR3_REG_RELATIV)
+                       return false;
+               if (src->flags & IR3_REG_RELATIV)
+                       return false;
+
+               if (src->flags & IR3_REG_ARRAY)
+                       return false;
+
+               if (!allow_flags)
+                       if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
+                                       IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
+                               return false;
+
+               /* TODO: remove this hack: */
+               if (src_instr->opc == OPC_META_FO)
+                       return false;
+
+               return true;
+       }
+       return false;
+}
+
+static unsigned cp_flags(unsigned flags)
+{
+       /* only considering these flags (at least for now): */
+       flags &= (IR3_REG_CONST | IR3_REG_IMMED |
+                       IR3_REG_FNEG | IR3_REG_FABS |
+                       IR3_REG_SNEG | IR3_REG_SABS |
+                       IR3_REG_BNOT | IR3_REG_RELATIV);
+       return flags;
+}
+
+static bool valid_flags(struct ir3_instruction *instr, unsigned n,
+               unsigned flags)
+{
+       unsigned valid_flags;
+       flags = cp_flags(flags);
+
+       /* If destination is indirect, then source cannot be.. at least
+        * I don't think so..
+        */
+       if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
+                       (flags & IR3_REG_RELATIV))
+               return false;
+
+       /* TODO it seems to *mostly* work to cp RELATIV, except we get some
+        * intermittent piglit variable-indexing fails.  Newer blob driver
+        * doesn't seem to cp these.  Possibly this is a hw workaround?  Not
+        * sure, but until that is understood better, let's just switch off
+        * cp for indirect src's:
+        */
+        */
+       if (flags & IR3_REG_RELATIV)
+               return false;
+
+       switch (opc_cat(instr->opc)) {
+       case 1:
+               valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
+               if (flags & ~valid_flags)
+                       return false;
+               break;
+       case 2:
+               valid_flags = ir3_cat2_absneg(instr->opc) |
+                               IR3_REG_CONST | IR3_REG_RELATIV;
+
+               if (ir3_cat2_int(instr->opc))
+                       valid_flags |= IR3_REG_IMMED;
+
+               if (flags & ~valid_flags)
+                       return false;
+
+               if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
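+                       /* (n ^ 1) flips between the two src positions,
+                        * and the +1 skips over the dst in regs[0]:
+                        */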
+                       unsigned m = (n ^ 1) + 1;
+                       /* cannot deal w/ const in both srcs:
+                        * (note that some cat2 actually only have a single src)
+                        */
+                       if (m < instr->regs_count) {
+                               struct ir3_register *reg = instr->regs[m];
+                               if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
+                                       return false;
+                               if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
+                                       return false;
+                       }
+                       /* cannot be const + ABS|NEG: */
+                       if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
+                                       IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
+                               return false;
+               }
+               break;
+       case 3:
+               valid_flags = ir3_cat3_absneg(instr->opc) |
+                               IR3_REG_CONST | IR3_REG_RELATIV;
+
+               if (flags & ~valid_flags)
+                       return false;
+
+               if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) {
+                       /* cannot deal w/ const/relativ in 2nd src: */
+                       if (n == 1)
+                               return false;
+               }
+
+               if (flags & IR3_REG_CONST) {
+                       /* cannot be const + ABS|NEG: */
+                       if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
+                                       IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
+                               return false;
+               }
+               break;
+       case 4:
+               /* seems like blob compiler avoids const as src.. */
+               /* TODO double check if this is still the case on a4xx */
+               if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
+                       return false;
+               if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
+                       return false;
+               break;
+       case 5:
+               /* no flags allowed */
+               if (flags)
+                       return false;
+               break;
+       case 6:
+               valid_flags = IR3_REG_IMMED;
+               if (flags & ~valid_flags)
+                       return false;
+
+               if (flags & IR3_REG_IMMED) {
+                       /* doesn't seem like we can have immediate src for store
+                        * instructions:
+                        *
+                        * TODO this restriction could also apply to load instructions,
+                        * but for load instructions this arg is the address (and I'm not
+                        * really sure of any good way to test a hard-coded immed addr src)
+                        */
+                       if (is_store(instr) && (n == 1))
+                               return false;
+
+                       if ((instr->opc == OPC_LDL) && (n != 1))
+                               return false;
+
+                       if ((instr->opc == OPC_STL) && (n != 2))
+                               return false;
+
+                       /* disallow CP into anything but the SSBO slot argument for
+                        * atomics:
+                        */
+                       if (is_atomic(instr->opc) && (n != 0))
+                               return false;
+
+                       if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
+                               return false;
+               }
+
+               break;
+       }
+
+       return true;
+}
+
+/* propagate register flags from src to dst.. negates need special
+ * handling to cancel each other out.
+ */
+static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
+{
+       unsigned srcflags = src->regs[1]->flags;
+
+       /* if what we are combining into already has (abs) flags,
+        * we can drop (neg) from src:
+        */
+       if (*dstflags & IR3_REG_FABS)
+               srcflags &= ~IR3_REG_FNEG;
+       if (*dstflags & IR3_REG_SABS)
+               srcflags &= ~IR3_REG_SNEG;
+
+       if (srcflags & IR3_REG_FABS)
+               *dstflags |= IR3_REG_FABS;
+       if (srcflags & IR3_REG_SABS)
+               *dstflags |= IR3_REG_SABS;
+       if (srcflags & IR3_REG_FNEG)
+               *dstflags ^= IR3_REG_FNEG;
+       if (srcflags & IR3_REG_SNEG)
+               *dstflags ^= IR3_REG_SNEG;
+       if (srcflags & IR3_REG_BNOT)
+               *dstflags ^= IR3_REG_BNOT;
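+       /* (eg. (neg)(neg)x cancels back to x, which is why the neg/not
+        * bits above are xor'd rather than or'd)
+        */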
+
+       *dstflags &= ~IR3_REG_SSA;
+       *dstflags |= srcflags & IR3_REG_SSA;
+       *dstflags |= srcflags & IR3_REG_CONST;
+       *dstflags |= srcflags & IR3_REG_IMMED;
+       *dstflags |= srcflags & IR3_REG_RELATIV;
+       *dstflags |= srcflags & IR3_REG_ARRAY;
+
+       /* if src of the src is boolean we can drop the (abs) since we know
+        * the source value is already a positive integer.  This cleans
+        * up the absnegs that get inserted when converting between nir and
+        * native boolean (see ir3_b2n/n2b)
+        */
+       struct ir3_instruction *srcsrc = ssa(src->regs[1]);
+       if (srcsrc && is_bool(srcsrc))
+               *dstflags &= ~IR3_REG_SABS;
+}
+
+static struct ir3_register *
+lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags)
+{
+       unsigned swiz, idx, i;
+
+       reg = ir3_reg_clone(ctx->shader, reg);
+
+       /* in some cases, there are restrictions on (abs)/(neg) plus const..
+        * so just evaluate those and clear the flags:
+        */
+       if (new_flags & IR3_REG_SABS) {
+               reg->iim_val = abs(reg->iim_val);
+               new_flags &= ~IR3_REG_SABS;
+       }
+
+       if (new_flags & IR3_REG_FABS) {
+               reg->fim_val = fabs(reg->fim_val);
+               new_flags &= ~IR3_REG_FABS;
+       }
+
+       if (new_flags & IR3_REG_SNEG) {
+               reg->iim_val = -reg->iim_val;
+               new_flags &= ~IR3_REG_SNEG;
+       }
+
+       if (new_flags & IR3_REG_FNEG) {
+               reg->fim_val = -reg->fim_val;
+               new_flags &= ~IR3_REG_FNEG;
+       }
+
+       /* Reallocate for 4 more elements whenever it's necessary */
+       if (ctx->immediate_idx == ctx->so->immediates_size * 4) {
+               ctx->so->immediates_size += 4;
+               ctx->so->immediates = realloc (ctx->so->immediates,
+                       ctx->so->immediates_size * sizeof (ctx->so->immediates[0]));
+       }
+
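+       /* look for an already-loaded matching immediate.. immediates
+        * are packed four scalars per vec4 const slot:
+        */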
+       for (i = 0; i < ctx->immediate_idx; i++) {
+               swiz = i % 4;
+               idx  = i / 4;
+
+               if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) {
+                       break;
+               }
+       }
+
+       if (i == ctx->immediate_idx) {
+               /* need to generate a new immediate: */
+               swiz = i % 4;
+               idx  = i / 4;
+               ctx->so->immediates[idx].val[swiz] = reg->uim_val;
+               ctx->so->immediates_count = idx + 1;
+               ctx->immediate_idx++;
+       }
+
+       new_flags &= ~IR3_REG_IMMED;
+       new_flags |= IR3_REG_CONST;
+       reg->flags = new_flags;
+       reg->num = i + (4 * ctx->so->constbase.immediate);
+
+       return reg;
+}
+
+static void
+unuse(struct ir3_instruction *instr)
+{
+       debug_assert(instr->use_count > 0);
+
+       if (--instr->use_count == 0) {
+               struct ir3_block *block = instr->block;
+
+               instr->barrier_class = 0;
+               instr->barrier_conflict = 0;
+
+               /* we don't want to remove anything in keeps (which could
+        * be things like array stores)
+                */
+               for (unsigned i = 0; i < block->keeps_count; i++) {
+                       debug_assert(block->keeps[i] != instr);
+               }
+       }
+}
+
+/**
+ * Handle cp for a given src register.  This additionally handles
+ * the cases of collapsing immediate/const (which replace the src
+ * register with a non-ssa src) or collapsing mov's from a relative
+ * src (which also needs to fix up the address register referenced
+ * by the instruction).
+ */
+static void
+reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
+               struct ir3_register *reg, unsigned n)
+{
+       struct ir3_instruction *src = ssa(reg);
+
+       if (is_eligible_mov(src, true)) {
+               /* simple case, no immed/const/relativ, only mov's w/ ssa src: */
+               struct ir3_register *src_reg = src->regs[1];
+               unsigned new_flags = reg->flags;
+
+               combine_flags(&new_flags, src);
+
+               if (valid_flags(instr, n, new_flags)) {
+                       if (new_flags & IR3_REG_ARRAY) {
+                               debug_assert(!(reg->flags & IR3_REG_ARRAY));
+                               reg->array = src_reg->array;
+                       }
+                       reg->flags = new_flags;
+                       reg->instr = ssa(src_reg);
+
+                       instr->barrier_class |= src->barrier_class;
+                       instr->barrier_conflict |= src->barrier_conflict;
+
+                       unuse(src);
+                       reg->instr->use_count++;
+               }
+
+       } else if (is_same_type_mov(src) &&
+                       /* cannot collapse const/immed/etc into meta instrs: */
+                       !is_meta(instr)) {
+               /* immed/const/etc cases, which require some special handling: */
+               struct ir3_register *src_reg = src->regs[1];
+               unsigned new_flags = reg->flags;
+
+               combine_flags(&new_flags, src);
+
+               if (!valid_flags(instr, n, new_flags)) {
+                       /* See if lowering an immediate to const would help. */
+                       if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
+                               debug_assert(new_flags & IR3_REG_IMMED);
+                               instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags);
+                               return;
+                       }
+
+                       /* special case for "normal" mad instructions: we can
+                        * try swapping the first two args if that fits better.
+                        *
+                        * the "plain" MAD's (ie. the ones that don't shift first
+                        * src prior to multiply) can swap their first two srcs if
+                        * src[0] is !CONST and src[1] is CONST:
+                        */
+                       if ((n == 1) && is_mad(instr->opc) &&
+                                       !(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) &&
+                                       valid_flags(instr, 0, new_flags & ~IR3_REG_IMMED)) {
+                               /* swap src[0] and src[1]: */
+                               struct ir3_register *tmp;
+                               tmp = instr->regs[0 + 1];
+                               instr->regs[0 + 1] = instr->regs[1 + 1];
+                               instr->regs[1 + 1] = tmp;
+
+                               n = 0;
+                       } else {
+                               return;
+                       }
+               }
+
+               /* Here we handle the special case of mov from
+                * CONST and/or RELATIV.  These need to be handled
+                * specially, because in the case of move from CONST
+                * there is no src ir3_instruction so we need to
+                * replace the ir3_register.  And in the case of
+                * RELATIV we need to handle the address register
+                * dependency.
+                */
+               if (src_reg->flags & IR3_REG_CONST) {
+                       /* an instruction cannot reference two different
+                        * address registers:
+                        */
+                       if ((src_reg->flags & IR3_REG_RELATIV) &&
+                                       conflicts(instr->address, reg->instr->address))
+                               return;
+
+                       /* This seems to be a hw bug, or something where the timings
+                        * just somehow don't work out.  This restriction may only
+                        * apply if the first src is also CONST.
+                        */
+                       if ((opc_cat(instr->opc) == 3) && (n == 2) &&
+                                       (src_reg->flags & IR3_REG_RELATIV) &&
+                                       (src_reg->array.offset == 0))
+                               return;
+
+                       src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+                       src_reg->flags = new_flags;
+                       instr->regs[n+1] = src_reg;
+
+                       if (src_reg->flags & IR3_REG_RELATIV)
+                               ir3_instr_set_address(instr, reg->instr->address);
+
+                       return;
+               }
+
+               if ((src_reg->flags & IR3_REG_RELATIV) &&
+                               !conflicts(instr->address, reg->instr->address)) {
+                       src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+                       src_reg->flags = new_flags;
+                       instr->regs[n+1] = src_reg;
+                       ir3_instr_set_address(instr, reg->instr->address);
+
+                       return;
+               }
+
+               /* NOTE: seems we can only do immed integers, so don't
+                * need to care about float.  But we do need to handle
+                * abs/neg *before* checking that the immediate requires
+                * few enough bits to encode:
+                *
+                * TODO: do we need to do something to avoid accidentally
+                * catching a float immed?
+                */
+               if (src_reg->flags & IR3_REG_IMMED) {
+                       int32_t iim_val = src_reg->iim_val;
+
+                       debug_assert((opc_cat(instr->opc) == 1) ||
+                                       (opc_cat(instr->opc) == 6) ||
+                                       ir3_cat2_int(instr->opc) ||
+                                       (is_mad(instr->opc) && (n == 0)));
+
+                       if (new_flags & IR3_REG_SABS)
+                               iim_val = abs(iim_val);
+
+                       if (new_flags & IR3_REG_SNEG)
+                               iim_val = -iim_val;
+
+                       if (new_flags & IR3_REG_BNOT)
+                               iim_val = ~iim_val;
+
+                       /* other than category 1 (mov) we can only encode up to 10 bits: */
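+                       /* (ie. the check below passes when either iim_val or
+                        * -iim_val fits in 10 bits, which works out to roughly
+                        * the range [-1023, 1023])
+                        */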
+                       if ((instr->opc == OPC_MOV) ||
+                                       !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) {
+                               new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
+                               src_reg = ir3_reg_clone(instr->block->shader, src_reg);
+                               src_reg->flags = new_flags;
+                               src_reg->iim_val = iim_val;
+                               instr->regs[n+1] = src_reg;
+                       } else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
+                               /* See if lowering an immediate to const would help. */
+                               instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags);
+                       }
+
+                       return;
+               }
+       }
+}
+
+/* Handle special case of eliminating output mov, and similar cases where
+ * there isn't a normal "consuming" instruction.  In this case we cannot
+ * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
+ * be eliminated)
+ */
+static struct ir3_instruction *
+eliminate_output_mov(struct ir3_instruction *instr)
+{
+       if (is_eligible_mov(instr, false)) {
+               struct ir3_register *reg = instr->regs[1];
+               if (!(reg->flags & IR3_REG_ARRAY)) {
+                       struct ir3_instruction *src_instr = ssa(reg);
+                       debug_assert(src_instr);
+                       return src_instr;
+               }
+       }
+       return instr;
+}
+
+/**
+ * Find instruction src's which are mov's that can be collapsed, replacing
+ * the mov dst with the mov src
+ */
+static void
+instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
+{
+       struct ir3_register *reg;
+
+       if (instr->regs_count == 0)
+               return;
+
+       if (ir3_instr_check_mark(instr))
+               return;
+
+       /* walk down the graph from each src: */
+       foreach_src_n(reg, n, instr) {
+               struct ir3_instruction *src = ssa(reg);
+
+               if (!src)
+                       continue;
+
+               instr_cp(ctx, src);
+
+               /* TODO non-indirect access we could figure out which register
+                * we actually want and allow cp..
+                */
+               if (reg->flags & IR3_REG_ARRAY)
+                       continue;
+
+               /* Don't CP absneg into meta instructions, that won't end well: */
+               if (is_meta(instr) && (src->opc != OPC_MOV))
+                       continue;
+
+               reg_cp(ctx, instr, reg, n);
+       }
+
+       if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+               struct ir3_instruction *src = ssa(instr->regs[0]);
+               if (src)
+                       instr_cp(ctx, src);
+       }
+
+       if (instr->address) {
+               instr_cp(ctx, instr->address);
+               ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
+       }
+
+       /* we can end up with extra cmps.s from the frontend, which uses a
+        *
+        *    cmps.s p0.x, cond, 0
+        *
+        * as a way to mov into the predicate register.  But frequently 'cond'
+        * is itself a cmps.s/cmps.f/cmps.u.  So detect this special case and
+        * just re-write the instruction writing the predicate register to get
+        * rid of the double cmps.
+        */
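+       /* before/after sketch (illustrative operand names):
+        *
+        *    cmps.f.gt r0.x, a, b
+        *    cmps.s.ne p0.x, r0.x, 0
+        *
+        * collapses to:
+        *
+        *    cmps.f.gt p0.x, a, b
+        */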
+       if ((instr->opc == OPC_CMPS_S) &&
+                       (instr->regs[0]->num == regid(REG_P0, 0)) &&
+                       ssa(instr->regs[1]) &&
+                       (instr->regs[2]->flags & IR3_REG_IMMED) &&
+                       (instr->regs[2]->iim_val == 0)) {
+               struct ir3_instruction *cond = ssa(instr->regs[1]);
+               switch (cond->opc) {
+               case OPC_CMPS_S:
+               case OPC_CMPS_F:
+               case OPC_CMPS_U:
+                       instr->opc   = cond->opc;
+                       instr->flags = cond->flags;
+                       instr->cat2  = cond->cat2;
+                       instr->address = cond->address;
+                       instr->regs[1] = cond->regs[1];
+                       instr->regs[2] = cond->regs[2];
+                       instr->barrier_class |= cond->barrier_class;
+                       instr->barrier_conflict |= cond->barrier_conflict;
+                       unuse(cond);
+                       break;
+               default:
+                       break;
+               }
+       }
+}
+
+void
+ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
+{
+       struct ir3_cp_ctx ctx = {
+                       .shader = ir,
+                       .so = so,
+       };
+
+       /* This is a bit annoying, and probably wouldn't be necessary if we
+        * tracked a reverse link from producing instruction to consumer.
+        * But we need to know when we've eliminated the last consumer of
+        * a mov, so we need to do a pass to first count consumers of a
+        * mov.
+        */
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       struct ir3_instruction *src;
+
+                       /* by the way, we don't account for false-dep's, so the CP
+                        * pass should always happen before false-dep's are inserted
+                        */
+                       debug_assert(instr->deps_count == 0);
+
+                       foreach_ssa_src(src, instr) {
+                               src->use_count++;
+                       }
+               }
+       }
+
+       ir3_clear_mark(ir);
+
+       for (unsigned i = 0; i < ir->noutputs; i++) {
+               if (ir->outputs[i]) {
+                       instr_cp(&ctx, ir->outputs[i]);
+                       ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
+               }
+       }
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               if (block->condition) {
+                       instr_cp(&ctx, block->condition);
+                       block->condition = eliminate_output_mov(block->condition);
+               }
+
+               for (unsigned i = 0; i < block->keeps_count; i++) {
+                       instr_cp(&ctx, block->keeps[i]);
+                       block->keeps[i] = eliminate_output_mov(block->keeps[i]);
+               }
+       }
+}
diff --git a/src/freedreno/ir3/ir3_depth.c b/src/freedreno/ir3/ir3_depth.c
new file mode 100644 (file)
index 0000000..73bf5e1
--- /dev/null
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Depth:
+ *
+ * Calculates weighted instruction depth, ie. the sum of # of needed
+ * instructions plus delay slots back to original input (ie INPUT or
+ * CONST).  That is to say, an instruction's depth is:
+ *
+ *   depth(instr) {
+ *     d = 0;
+ *     // for each src register:
+ *     foreach (src in instr->regs[1..n])
+ *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
+ *     return d + 1;
+ *   }
+ *
+ * After an instruction's depth is calculated, it is inserted into the
+ * block's depth-sorted list, which is used by the scheduling pass.
+ */
+
+/* generally don't count false dependencies, since this can just be
+ * something like a barrier or SSBO store.  The exception is array
+ * dependencies if the assigner is an array write and the consumer
+ * reads the same array.
+ */
+static bool
+ignore_dep(struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer, unsigned n)
+{
+       if (!__is_false_dep(consumer, n))
+               return false;
+
+       if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
+               struct ir3_register *dst = assigner->regs[0];
+               struct ir3_register *src;
+
+               debug_assert(dst->flags & IR3_REG_ARRAY);
+
+               foreach_src(src, consumer) {
+                       if ((src->flags & IR3_REG_ARRAY) &&
+                                       (dst->array.id == src->array.id)) {
+                               return false;
+                       }
+               }
+       }
+
+       return true;
+}
+
+/* calculate required # of delay slots between the instruction that
+ * assigns a value and the one that consumes it.
+ */
+int ir3_delayslots(struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer, unsigned n)
+{
+       if (ignore_dep(assigner, consumer, n))
+               return 0;
+
+       /* worst case is cat1-3 (alu) -> cat4/5, needing 6 cycles; normal
+        * alu -> alu needs 3 cycles; cat4 -> alu and texture fetch are
+        * handled with sync bits
+        */
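+       /* eg. (illustrative) an add.f (cat2) feeding a rsq (cat4) needs
+        * the full 6 delay slots, while add.f feeding another add.f only
+        * needs 3, matching the cases below:
+        */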
+
+       if (is_meta(assigner))
+               return 0;
+
+       if (writes_addr(assigner))
+               return 6;
+
+       /* handled via sync flags: */
+       if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
+               return 0;
+
+       /* assigner must be alu: */
+       if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
+                       is_mem(consumer)) {
+               return 6;
+       } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
+                       (n == 3)) {
+               /* special case, 3rd src to cat3 not required on first cycle */
+               return 1;
+       } else {
+               return 3;
+       }
+}
+
+void
+ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
+{
+       /* remove from existing spot in list: */
+       list_delinit(&instr->node);
+
+       /* find where to re-insert instruction: */
+       list_for_each_entry (struct ir3_instruction, pos, list, node) {
+               if (pos->depth > instr->depth) {
+                       list_add(&instr->node, &pos->node);
+                       return;
+               }
+       }
+       /* if we get here, we didn't find an insertion spot: */
+       list_addtail(&instr->node, list);
+}
+
+static void
+ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
+{
+       struct ir3_instruction *src;
+
+       /* don't mark falsedep's as used, but otherwise process them normally: */
+       if (!falsedep)
+               instr->flags &= ~IR3_INSTR_UNUSED;
+
+       if (ir3_instr_check_mark(instr))
+               return;
+
+       instr->depth = 0;
+
+       foreach_ssa_src_n(src, i, instr) {
+               unsigned sd;
+
+               /* visit child to compute its depth: */
+               ir3_instr_depth(src, boost, __is_false_dep(instr, i));
+
+               /* for array writes, no need to delay on previous write: */
+               if (i == 0)
+                       continue;
+
+               sd = ir3_delayslots(src, instr, i) + src->depth;
+               sd += boost;
+
+               instr->depth = MAX2(instr->depth, sd);
+       }
+
+       if (!is_meta(instr))
+               instr->depth++;
+
+       ir3_insert_by_depth(instr, &instr->block->instr_list);
+}
+
+static bool
+remove_unused_by_block(struct ir3_block *block)
+{
+       bool progress = false;
+       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+               if (instr->opc == OPC_END)
+                       continue;
+               if (instr->flags & IR3_INSTR_UNUSED) {
+                       list_delinit(&instr->node);
+                       progress = true;
+               }
+       }
+       return progress;
+}
+
+static bool
+compute_depth_and_remove_unused(struct ir3 *ir)
+{
+       unsigned i;
+       bool progress = false;
+
+       ir3_clear_mark(ir);
+
+       /* initially mark everything as unused, we'll clear the flag as we
+        * visit the instructions:
+        */
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       instr->flags |= IR3_INSTR_UNUSED;
+               }
+       }
+
+       for (i = 0; i < ir->noutputs; i++)
+               if (ir->outputs[i])
+                       ir3_instr_depth(ir->outputs[i], 0, false);
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               for (i = 0; i < block->keeps_count; i++)
+                       ir3_instr_depth(block->keeps[i], 0, false);
+
+               /* We also need to account for if-condition: */
+               if (block->condition)
+                       ir3_instr_depth(block->condition, 6, false);
+       }
+
+       /* mark un-used instructions: */
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               progress |= remove_unused_by_block(block);
+       }
+
+       /* note that we can end up with unused indirects, but we should
+        * not end up with unused predicates.
+        */
+       for (i = 0; i < ir->indirects_count; i++) {
+               struct ir3_instruction *instr = ir->indirects[i];
+               if (instr && (instr->flags & IR3_INSTR_UNUSED))
+                       ir->indirects[i] = NULL;
+       }
+
+       /* cleanup unused inputs: */
+       for (i = 0; i < ir->ninputs; i++) {
+               struct ir3_instruction *in = ir->inputs[i];
+               if (in && (in->flags & IR3_INSTR_UNUSED))
+                       ir->inputs[i] = NULL;
+       }
+
+       return progress;
+}
+
+void
+ir3_depth(struct ir3 *ir)
+{
+       bool progress;
+       do {
+               progress = compute_depth_and_remove_unused(ir);
+       } while (progress);
+}
diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c
new file mode 100644 (file)
index 0000000..5700559
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "ir3.h"
+
+/*
+ * Find/group instruction neighbors:
+ */
+
+/* bleh.. we need to do the same group_n() thing for both inputs/outputs
+ * (where we have a simple instr[] array), and fanin nodes (where we have
+ * an extra indirection via reg->instr).
+ */
+struct group_ops {
+       struct ir3_instruction *(*get)(void *arr, int idx);
+       void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr);
+};
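+/* (instantiated below as arr_ops_in/arr_ops_out for the plain
+ * instr[] arrays, and instr_ops for the fanin case)
+ */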
+
+static struct ir3_instruction *arr_get(void *arr, int idx)
+{
+       return ((struct ir3_instruction **)arr)[idx];
+}
+static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
+{
+       ((struct ir3_instruction **)arr)[idx] =
+                       ir3_MOV(instr->block, instr, TYPE_F32);
+}
+static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
+{
+       /* so, we can't insert a mov in front of a meta:in.. and the downstream
+        * instruction already has a pointer to 'instr'.  So we cheat a bit and
+        * morph the meta:in instruction into a mov and insert a new meta:in
+        * in front.
+        */
+       struct ir3_instruction *in;
+
+       debug_assert(instr->regs_count == 1);
+
+       in = ir3_instr_create(instr->block, OPC_META_INPUT);
+       in->inout.block = instr->block;
+       ir3_reg_create(in, instr->regs[0]->num, 0);
+
+       /* create src reg for meta:in and fixup to now be a mov: */
+       ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = in;
+       instr->opc = OPC_MOV;
+       instr->cat1.src_type = TYPE_F32;
+       instr->cat1.dst_type = TYPE_F32;
+
+       ((struct ir3_instruction **)arr)[idx] = in;
+}
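+/* Sketch of the fixup above; downstream users keep their pointer to
+ * the same ir3_instruction, which changes identity under them:
+ *
+ *    before:  instr = meta:in           <- pointed at by downstream
+ *    after:   in    = meta:in
+ *             instr = mov.f32f32 _[in]  <- same pointer, now a mov
+ */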
+static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out };
+static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in };
+
+static struct ir3_instruction *instr_get(void *arr, int idx)
+{
+       return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
+}
+static void
+instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
+{
+       ((struct ir3_instruction *)arr)->regs[idx+1]->instr =
+                       ir3_MOV(instr->block, instr, TYPE_F32);
+}
+static struct group_ops instr_ops = { instr_get, instr_insert_mov };
+
+/* returns true if 'cur' is 'instr' itself, or if 'cur' already appears
+ * in instr's neighbor-list:
+ */
+static bool
+in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos)
+{
+       int idx = 0;
+
+       if (!instr)
+               return false;
+
+       if (instr == cur)
+               return true;
+
+       for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right)
+               if ((idx++ != pos) && (instr == cur))
+                       return true;
+
+       return false;
+}
+
+static void
+group_n(struct group_ops *ops, void *arr, unsigned n)
+{
+       unsigned i, j;
+
+       /* first pass, figure out what has conflicts and needs a mov
+        * inserted.  Do this up front, before starting to setup
+        * left/right neighbor pointers.  Trying to do it in a single
+        * pass could result in a situation where we can't even setup
+        * the mov's right neighbor ptr if the next instr also needs
+        * a mov.
+        */
+restart:
+       for (i = 0; i < n; i++) {
+               struct ir3_instruction *instr = ops->get(arr, i);
+               if (instr) {
+                       struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+                       struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+                       bool conflict;
+
+                       /* check for left/right neighbor conflicts: */
+                       conflict = conflicts(instr->cp.left, left) ||
+                               conflicts(instr->cp.right, right);
+
+                       /* Mixing array elements and higher register classes
+                        * (ie. groups) doesn't really work out in RA.  See:
+                        *
+                        * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag
+                        */
+                       if (instr->regs[0]->flags & IR3_REG_ARRAY)
+                               conflict = true;
+
+                       /* we also can't have an instr twice in the group: */
+                       for (j = i + 1; (j < n) && !conflict; j++)
+                               if (in_neighbor_list(ops->get(arr, j), instr, i))
+                                       conflict = true;
+
+                       if (conflict) {
+                               ops->insert_mov(arr, i, instr);
+                               /* inserting the mov may have caused a conflict
+                                * against the previous:
+                                */
+                               goto restart;
+                       }
+               }
+       }
+
+       /* second pass, now that we've inserted mov's, fixup left/right
+        * neighbors.  This is guaranteed to succeed, since by definition
+        * the newly inserted mov's cannot conflict with anything.
+        */
+       for (i = 0; i < n; i++) {
+               struct ir3_instruction *instr = ops->get(arr, i);
+               if (instr) {
+                       struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
+                       struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
+
+                       debug_assert(!conflicts(instr->cp.left, left));
+                       if (left) {
+                               instr->cp.left_cnt++;
+                               instr->cp.left = left;
+                       }
+
+                       debug_assert(!conflicts(instr->cp.right, right));
+                       if (right) {
+                               instr->cp.right_cnt++;
+                               instr->cp.right = right;
+                       }
+               }
+       }
+}
+
+static void
+instr_find_neighbors(struct ir3_instruction *instr)
+{
+       struct ir3_instruction *src;
+
+       if (ir3_instr_check_mark(instr))
+               return;
+
+       if (instr->opc == OPC_META_FI)
+               group_n(&instr_ops, instr, instr->regs_count - 1);
+
+       foreach_ssa_src(src, instr)
+               instr_find_neighbors(src);
+}
+
+/* a bit of sadness.. we can't have "holes" in inputs from PoV of
+ * register assignment, they still need to be grouped together.  So
+ * we need to insert a dummy/padding instruction for grouping, and
+ * then take it back out again before anyone notices.
+ */
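+/* e.g. a vec4 slot arriving as {in.x, NULL, in.z, in.w} temporarily
+ * becomes {in.x, <dummy nop dst>, in.z, in.w} for group_n(), with
+ * 'mask' below remembering which slots to NULL back out afterwards.
+ */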
+static void
+pad_and_group_input(struct ir3_instruction **input, unsigned n)
+{
+       int i, mask = 0;
+       struct ir3_block *block = NULL;
+
+       for (i = n - 1; i >= 0; i--) {
+               struct ir3_instruction *instr = input[i];
+               if (instr) {
+                       block = instr->block;
+               } else if (block) {
+                       instr = ir3_NOP(block);
+                       ir3_reg_create(instr, 0, IR3_REG_SSA);    /* dummy dst */
+                       input[i] = instr;
+                       mask |= (1 << i);
+               }
+       }
+
+       group_n(&arr_ops_in, input, n);
+
+       for (i = 0; i < n; i++) {
+               if (mask & (1 << i))
+                       input[i] = NULL;
+       }
+}
+
+static void
+find_neighbors(struct ir3 *ir)
+{
+       unsigned i;
+
+       /* shader inputs/outputs themselves must be contiguous as well:
+        *
+        * NOTE: group inputs first, since we only insert mov's
+        * *before* the conflicted instr (and that would go badly
+        * for inputs).  By doing inputs first, we should never
+        * have a conflict on inputs; any conflicts get pushed out
+        * and resolved at the outputs, for stuff like:
+        *
+        *     MOV OUT[n], IN[m].wzyx
+        *
+        * NOTE: we assume here that inputs/outputs are grouped in
+        * vec4.  This logic won't quite cut it if smaller vectors
+        * are not aligned on vec4 boundaries.
+        */
+       for (i = 0; i < ir->ninputs; i += 4)
+               pad_and_group_input(&ir->inputs[i], 4);
+       for (i = 0; i < ir->noutputs; i += 4)
+               group_n(&arr_ops_out, &ir->outputs[i], 4);
+
+       for (i = 0; i < ir->noutputs; i++) {
+               if (ir->outputs[i]) {
+                       struct ir3_instruction *instr = ir->outputs[i];
+                       instr_find_neighbors(instr);
+               }
+       }
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               for (i = 0; i < block->keeps_count; i++) {
+                       struct ir3_instruction *instr = block->keeps[i];
+                       instr_find_neighbors(instr);
+               }
+
+               /* We also need to account for if-condition: */
+               if (block->condition)
+                       instr_find_neighbors(block->condition);
+       }
+}
+
+void
+ir3_group(struct ir3 *ir)
+{
+       ir3_clear_mark(ir);
+       find_neighbors(ir);
+}
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
new file mode 100644 (file)
index 0000000..ff4c644
--- /dev/null
@@ -0,0 +1,496 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/ralloc.h"
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Legalize:
+ *
+ * We currently require that scheduling ensures that we have enough nop's
+ * in all the right places.  The legalize step mostly handles fixing up
+ * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's
+ * into fewer nop's w/ rpt flag.
+ */
+
+struct ir3_legalize_ctx {
+       int num_samp;
+       bool has_ssbo;
+       int max_bary;
+};
+
+struct ir3_legalize_state {
+       regmask_t needs_ss;
+       regmask_t needs_ss_war;       /* write after read */
+       regmask_t needs_sy;
+};
+
+struct ir3_legalize_block_data {
+       bool valid;
+       struct ir3_legalize_state state;
+};
+
+/* We want to evaluate each block from the perspective of every
+ * possible predecessor block, so that the flags we set end up being
+ * the union over all possible program paths.
+ *
+ * To do this, we need to know the output state (needs_ss/ss_war/sy)
+ * of all predecessor blocks.  The tricky thing is loops, which mean
+ * that we can't simply recursively process each predecessor block
+ * before legalizing the current block.
+ *
+ * How we handle that is by looping over all the blocks until the
+ * results converge.  If the output state of a given block changes
+ * in a given pass, this means that all successor blocks are not
+ * yet fully legalized.
+ */
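+
+/*
+ * In dataflow terms this is the usual forward fixpoint iteration:
+ *
+ *    IN(b)  = OR of OUT(p) for all predecessors p of b
+ *    OUT(b) = effect of b's instructions applied to IN(b)
+ *
+ * repeated until OUT(b) stops changing for every block b.
+ */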
+
+static bool
+legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
+{
+       struct ir3_legalize_block_data *bd = block->data;
+
+       if (bd->valid)
+               return false;
+
+       struct ir3_instruction *last_input = NULL;
+       struct ir3_instruction *last_rel = NULL;
+       struct ir3_instruction *last_n = NULL;
+       struct list_head instr_list;
+       struct ir3_legalize_state prev_state = bd->state;
+       struct ir3_legalize_state *state = &bd->state;
+
+       /* our input state is the OR of all predecessor blocks' state: */
+       for (unsigned i = 0; i < block->predecessors_count; i++) {
+               struct ir3_legalize_block_data *pbd = block->predecessors[i]->data;
+               struct ir3_legalize_state *pstate = &pbd->state;
+
+               /* Our input (ss)/(sy) state is based on OR'ing the output
+                * state of all our predecessor blocks
+                */
+               regmask_or(&state->needs_ss,
+                               &state->needs_ss, &pstate->needs_ss);
+               regmask_or(&state->needs_ss_war,
+                               &state->needs_ss_war, &pstate->needs_ss_war);
+               regmask_or(&state->needs_sy,
+                               &state->needs_sy, &pstate->needs_sy);
+       }
+
+       /* remove all the instructions from the list, we'll be adding
+        * them back in as we go
+        */
+       list_replace(&block->instr_list, &instr_list);
+       list_inithead(&block->instr_list);
+
+       list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) {
+               struct ir3_register *reg;
+               unsigned i;
+
+               n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
+
+               if (is_meta(n))
+                       continue;
+
+               if (is_input(n)) {
+                       struct ir3_register *inloc = n->regs[1];
+                       assert(inloc->flags & IR3_REG_IMMED);
+                       ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
+               }
+
+               if (last_n && is_barrier(last_n))
+                       n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+
+               /* NOTE: consider dst register too.. it could happen that
+                * a texture sample instruction (for example) writes some
+                * components which are unused.  A subsequent instruction
+                * that writes the same register can race w/ the sam instr
+                * resulting in undefined results:
+                */
+               for (i = 0; i < n->regs_count; i++) {
+                       reg = n->regs[i];
+
+                       if (reg_gpr(reg)) {
+
+                               /* TODO: we probably only need (ss) for alu
+                                * instr consuming sfu result.. need to make
+                                * some tests for both this and (sy)..
+                                */
+                               if (regmask_get(&state->needs_ss, reg)) {
+                                       n->flags |= IR3_INSTR_SS;
+                                       regmask_init(&state->needs_ss_war);
+                                       regmask_init(&state->needs_ss);
+                               }
+
+                               if (regmask_get(&state->needs_sy, reg)) {
+                                       n->flags |= IR3_INSTR_SY;
+                                       regmask_init(&state->needs_sy);
+                               }
+                       }
+
+                       /* TODO: is it valid to have address reg loaded from a
+                        * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
+                        * last_rel check below should be moved ahead of this:
+                        */
+                       if (reg->flags & IR3_REG_RELATIV)
+                               last_rel = n;
+               }
+
+               if (n->regs_count > 0) {
+                       reg = n->regs[0];
+                       if (regmask_get(&state->needs_ss_war, reg)) {
+                               n->flags |= IR3_INSTR_SS;
+                               regmask_init(&state->needs_ss_war);
+                               regmask_init(&state->needs_ss);
+                       }
+
+                       if (last_rel && (reg->num == regid(REG_A0, 0))) {
+                               last_rel->flags |= IR3_INSTR_UL;
+                               last_rel = NULL;
+                       }
+               }
+
+               /* cat5+ does not have an (ss) bit; if needed we have to
+                * insert a nop to carry the sync flag.  Would be kinda
+                * clever if we were aware of this during scheduling, but
+                * this should be a pretty rare case:
+                */
+               if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
+                       struct ir3_instruction *nop;
+                       nop = ir3_NOP(block);
+                       nop->flags |= IR3_INSTR_SS;
+                       n->flags &= ~IR3_INSTR_SS;
+               }
+
+               /* need to be able to set (ss) on first instruction: */
+               if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
+                       ir3_NOP(block);
+
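+               /* e.g. "nop; nop; nop" folds into "(rpt2)nop"; since
+                * repeat is capped at 5, at most six nops merge into
+                * a single (rpt5)nop:
+                */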
+               if (is_nop(n) && !list_empty(&block->instr_list)) {
+                       struct ir3_instruction *last = list_last_entry(&block->instr_list,
+                                       struct ir3_instruction, node);
+                       if (is_nop(last) && (last->repeat < 5)) {
+                               last->repeat++;
+                               last->flags |= n->flags;
+                               continue;
+                       }
+               }
+
+               list_addtail(&n->node, &block->instr_list);
+
+               if (is_sfu(n))
+                       regmask_set(&state->needs_ss, n->regs[0]);
+
+               if (is_tex(n)) {
+                       /* this ends up being the # of samp instructions.. but that
+                        * is ok, everything else only cares whether it is zero or
+                        * not.  We do this here, rather than when we encounter a
+                        * SAMP decl, because (especially in binning pass shader)
+                        * the samp instruction(s) could get eliminated if the
+                        * result is not used.
+                        */
+                       ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1);
+                       regmask_set(&state->needs_sy, n->regs[0]);
+               } else if (n->opc == OPC_RESINFO) {
+                       regmask_set(&state->needs_ss, n->regs[0]);
+                       ir3_NOP(block)->flags |= IR3_INSTR_SS;
+               } else if (is_load(n)) {
+                       /* seems like ldlv needs (ss) bit instead??  which is odd but
+                        * makes a bunch of flat-varying tests start working on a4xx.
+                        */
+                       if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
+                               regmask_set(&state->needs_ss, n->regs[0]);
+                       else
+                               regmask_set(&state->needs_sy, n->regs[0]);
+               } else if (is_atomic(n->opc)) {
+                       if (n->flags & IR3_INSTR_G)
+                               regmask_set(&state->needs_sy, n->regs[0]);
+                       else
+                               regmask_set(&state->needs_ss, n->regs[0]);
+               }
+
+               if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
+                       ctx->has_ssbo = true;
+
+               /* both tex/sfu appear to not always immediately consume
+                * their src register(s):
+                */
+               if (is_tex(n) || is_sfu(n) || is_mem(n)) {
+                       foreach_src(reg, n) {
+                               if (reg_gpr(reg))
+                                       regmask_set(&state->needs_ss_war, reg);
+                       }
+               }
+
+               if (is_input(n))
+                       last_input = n;
+
+               last_n = n;
+       }
+
+       if (last_input) {
+               /* special hack.. if using ldlv to bypass interpolation,
+                * we need to insert a dummy bary.f on which we can set
+                * the (ei) flag:
+                */
+               if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
+                       struct ir3_instruction *baryf;
+
+                       /* (ss)bary.f (ei)r63.x, 0, r0.x */
+                       baryf = ir3_instr_create(block, OPC_BARY_F);
+                       baryf->flags |= IR3_INSTR_SS;
+                       ir3_reg_create(baryf, regid(63, 0), 0);
+                       ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
+                       ir3_reg_create(baryf, regid(0, 0), 0);
+
+                       /* insert the dummy bary.f after last_input: */
+                       list_delinit(&baryf->node);
+                       list_add(&baryf->node, &last_input->node);
+
+                       last_input = baryf;
+               }
+               last_input->regs[0]->flags |= IR3_REG_EI;
+       }
+
+       if (last_rel)
+               last_rel->flags |= IR3_INSTR_UL;
+
+       bd->valid = true;
+
+       if (memcmp(&prev_state, state, sizeof(*state))) {
+               /* our output state changed, this invalidates all of our
+                * successors:
+                */
+               for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
+                       if (!block->successors[i])
+                               break;
+                       struct ir3_legalize_block_data *pbd = block->successors[i]->data;
+                       pbd->valid = false;
+               }
+       }
+
+       return true;
+}
+
+/* NOTE: branch instructions are always the last instruction(s)
+ * in the block.  We take advantage of this as we resolve the
+ * branches, since "if (foo) break;" constructs turn into
+ * something like:
+ *
+ *   block3 {
+ *     ...
+ *     0029:021: mov.s32s32 r62.x, r1.y
+ *     0082:022: br !p0.x, target=block5
+ *     0083:023: br p0.x, target=block4
+ *     // succs: if _[0029:021: mov.s32s32] block4; else block5;
+ *   }
+ *   block4 {
+ *     0084:024: jump, target=block6
+ *     // succs: block6;
+ *   }
+ *   block5 {
+ *     0085:025: jump, target=block7
+ *     // succs: block7;
+ *   }
+ *
+ * ie. the only instruction in block4/block5 is a jump, so when
+ * resolving branches we can easily detect this by checking
+ * that the first instruction in the target block is itself
+ * a jump, and set up the br directly to the jump's target
+ * (and strip back out the now unreached jump)
+ *
+ * TODO sometimes we end up with things like:
+ *
+ *    br !p0.x, #2
+ *    br p0.x, #12
+ *    add.u r0.y, r0.y, 1
+ *
+ * If we swapped the order of the branches, we could drop one.
+ */
+static struct ir3_block *
+resolve_dest_block(struct ir3_block *block)
+{
+       /* special case for last block: */
+       if (!block->successors[0])
+               return block;
+
+       /* NOTE that we may or may not have inserted the jump
+        * in the target block yet, so conditions to resolve
+        * the dest to the dest block's successor are:
+        *
+        *   (1) successor[1] == NULL &&
+        *   (2) (block-is-empty || only-instr-is-jump)
+        */
+       if (block->successors[1] == NULL) {
+               if (list_empty(&block->instr_list)) {
+                       return block->successors[0];
+               } else if (list_length(&block->instr_list) == 1) {
+                       struct ir3_instruction *instr = list_first_entry(
+                                       &block->instr_list, struct ir3_instruction, node);
+                       if (instr->opc == OPC_JUMP)
+                               return block->successors[0];
+               }
+       }
+       return block;
+}
+
+static bool
+resolve_jump(struct ir3_instruction *instr)
+{
+       struct ir3_block *tblock =
+               resolve_dest_block(instr->cat0.target);
+       struct ir3_instruction *target;
+
+       if (tblock != instr->cat0.target) {
+               list_delinit(&instr->cat0.target->node);
+               instr->cat0.target = tblock;
+               return true;
+       }
+
+       target = list_first_entry(&tblock->instr_list,
+                               struct ir3_instruction, node);
+
+       /* TODO maybe a less fragile way to do this.  But we are expecting
+        * a pattern from sched_block() that looks like:
+        *
+        *   br !p0.x, #else-block
+        *   br p0.x, #if-block
+        *
+        * if the first branch target is +2, or if 2nd branch target is +1
+        * then we can just drop the jump.
+        */
+       unsigned next_block;
+       if (instr->cat0.inv == true)
+               next_block = 2;
+       else
+               next_block = 1;
+
+       if ((!target) || (target->ip == (instr->ip + next_block))) {
+               list_delinit(&instr->node);
+               return true;
+       } else {
+               instr->cat0.immed =
+                       (int)target->ip - (int)instr->ip;
+       }
+       return false;
+}
+
+/* resolve jumps, removing jumps/branches to the immediately following
+ * instruction, which we end up with from earlier stages.  Since
+ * removing an instruction can invalidate an earlier instruction's
+ * branch offsets, we need to do this iteratively until no more
+ * branches are removed.
+ */
+static bool
+resolve_jumps(struct ir3 *ir)
+{
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+                       if (is_flow(instr) && instr->cat0.target)
+                               if (resolve_jump(instr))
+                                       return true;
+
+       return false;
+}
+
+/* we want to mark points where divergent flow control re-converges
+ * with (jp) flags.  For now, since we don't do any optimization for
+ * things that start out as a 'do {} while()', re-convergence points
+ * will always be branch or jump targets.  Note that this is overly
+ * conservative, since unconditional jump targets are not convergence
+ * points, we are just assuming that the other path to reach the jump
+ * target was divergent.  If we were clever enough to optimize the
+ * jump at end of a loop back to a conditional branch into a single
+ * conditional branch, ie. like:
+ *
+ *    add.f r1.w, r0.x, (neg)(r)c2.x   <= loop start
+ *    mul.f r1.z, r1.z, r0.x
+ *    mul.f r1.y, r1.y, r0.x
+ *    mul.f r0.z, r1.x, r0.x
+ *    mul.f r0.w, r0.y, r0.x
+ *    cmps.f.ge r0.x, (r)c2.y, (r)r1.w
+ *    add.s r0.x, (r)r0.x, (r)-1
+ *    sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
+ *    cmps.f.eq p0.x, r0.x, c3.y
+ *    mov.f32f32 r0.x, r1.w
+ *    mov.f32f32 r0.y, r0.w
+ *    mov.f32f32 r1.x, r0.z
+ *    (rpt2)nop
+ *    br !p0.x, #-13
+ *    (jp)mul.f r0.x, c263.y, r1.y
+ *
+ * Then we'd have to be more clever, as the convergence point is no
+ * longer a branch or jump target.
+ */
+static void
+mark_convergence_points(struct ir3 *ir)
+{
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       if (is_flow(instr) && instr->cat0.target) {
+                               struct ir3_instruction *target =
+                                       list_first_entry(&instr->cat0.target->instr_list,
+                                                       struct ir3_instruction, node);
+                               target->flags |= IR3_INSTR_JP;
+                       }
+               }
+       }
+}
+
+void
+ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary)
+{
+       struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
+       bool progress;
+
+       ctx->max_bary = -1;
+
+       /* allocate per-block data: */
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               block->data = rzalloc(ctx, struct ir3_legalize_block_data);
+       }
+
+       /* process each block: */
+       do {
+               progress = false;
+               list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+                       progress |= legalize_block(ctx, block);
+               }
+       } while (progress);
+
+       *num_samp = ctx->num_samp;
+       *has_ssbo = ctx->has_ssbo;
+       *max_bary = ctx->max_bary;
+
+       do {
+               ir3_count_instructions(ir);
+       } while (resolve_jumps(ir));
+
+       mark_convergence_points(ir);
+
+       ralloc_free(ctx);
+}
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
new file mode 100644 (file)
index 0000000..70c01ee
--- /dev/null
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+
+#include "util/debug.h"
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+static const nir_shader_compiler_options options = {
+               .lower_fpow = true,
+               .lower_scmp = true,
+               .lower_flrp32 = true,
+               .lower_flrp64 = true,
+               .lower_ffract = true,
+               .lower_fmod32 = true,
+               .lower_fmod64 = true,
+               .lower_fdiv = true,
+               .lower_ldexp = true,
+               .fuse_ffma = true,
+               .native_integers = true,
+               .vertex_id_zero_based = true,
+               .lower_extract_byte = true,
+               .lower_extract_word = true,
+               .lower_all_io_to_temps = true,
+               .lower_helper_invocation = true,
+};
+
+const nir_shader_compiler_options *
+ir3_get_compiler_options(struct ir3_compiler *compiler)
+{
+       return &options;
+}
+
+/* for a given shader key, are any lowering steps handled in nir? */
+bool
+ir3_key_lowers_nir(const struct ir3_shader_key *key)
+{
+       return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
+                       key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
+                       key->ucp_enables | key->color_two_side |
+                       key->fclamp_color | key->vclamp_color;
+}
+
+#define OPT(nir, pass, ...) ({                             \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   this_progress;                                          \
+})
+
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+static void
+ir3_optimize_loop(nir_shader *s)
+{
+       bool progress;
+       do {
+               progress = false;
+
+               OPT_V(s, nir_lower_vars_to_ssa);
+               progress |= OPT(s, nir_opt_copy_prop_vars);
+               progress |= OPT(s, nir_opt_dead_write_vars);
+               progress |= OPT(s, nir_lower_alu_to_scalar);
+               progress |= OPT(s, nir_lower_phis_to_scalar);
+
+               progress |= OPT(s, nir_copy_prop);
+               progress |= OPT(s, nir_opt_dce);
+               progress |= OPT(s, nir_opt_cse);
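+               /* debug knob: GCM=1 and GCM=2 in the environment select
+                * the two nir_opt_gcm modes; unset/0 (the default) skips
+                * the pass entirely:
+                */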
+               static int gcm = -1;
+               if (gcm == -1)
+                       gcm = env_var_as_unsigned("GCM", 0);
+               if (gcm == 1)
+                       progress |= OPT(s, nir_opt_gcm, true);
+               else if (gcm == 2)
+                       progress |= OPT(s, nir_opt_gcm, false);
+               progress |= OPT(s, nir_opt_peephole_select, 16);
+               progress |= OPT(s, nir_opt_intrinsics);
+               progress |= OPT(s, nir_opt_algebraic);
+               progress |= OPT(s, nir_opt_constant_folding);
+               progress |= OPT(s, nir_opt_dead_cf);
+               if (OPT(s, nir_opt_trivial_continues)) {
+                       progress |= true;
+                       /* If nir_opt_trivial_continues makes progress, then we need to clean
+                        * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
+                        * to make progress.
+                        */
+                       OPT(s, nir_copy_prop);
+                       OPT(s, nir_opt_dce);
+               }
+               progress |= OPT(s, nir_opt_if);
+               progress |= OPT(s, nir_opt_remove_phis);
+               progress |= OPT(s, nir_opt_undef);
+
+       } while (progress);
+}
+
+struct nir_shader *
+ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+               const struct ir3_shader_key *key)
+{
+       struct nir_lower_tex_options tex_options = {
+                       .lower_rect = 0,
+       };
+
+       if (key) {
+               switch (shader->type) {
+               case MESA_SHADER_FRAGMENT:
+                       tex_options.saturate_s = key->fsaturate_s;
+                       tex_options.saturate_t = key->fsaturate_t;
+                       tex_options.saturate_r = key->fsaturate_r;
+                       break;
+               case MESA_SHADER_VERTEX:
+                       tex_options.saturate_s = key->vsaturate_s;
+                       tex_options.saturate_t = key->vsaturate_t;
+                       tex_options.saturate_r = key->vsaturate_r;
+                       break;
+               default:
+                       /* TODO */
+                       break;
+               }
+       }
+
+       if (shader->compiler->gpu_id >= 400) {
+               /* a4xx seems to have *no* sam.p */
+               tex_options.lower_txp = ~0;  /* lower all txp */
+       } else {
+               /* a3xx just needs to avoid sam.p for 3d tex */
+               tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+       }
+
+       if (ir3_shader_debug & IR3_DBG_DISASM) {
+               debug_printf("----------------------\n");
+               nir_print_shader(s, stdout);
+               debug_printf("----------------------\n");
+       }
+
+       OPT_V(s, nir_opt_global_to_local);
+       OPT_V(s, nir_lower_regs_to_ssa);
+
+       if (key) {
+               if (s->info.stage == MESA_SHADER_VERTEX) {
+                       OPT_V(s, nir_lower_clip_vs, key->ucp_enables, false);
+                       if (key->vclamp_color)
+                               OPT_V(s, nir_lower_clamp_color_outputs);
+               } else if (s->info.stage == MESA_SHADER_FRAGMENT) {
+                       OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
+                       if (key->fclamp_color)
+                               OPT_V(s, nir_lower_clamp_color_outputs);
+               }
+               if (key->color_two_side) {
+                       OPT_V(s, nir_lower_two_sided_color);
+               }
+       } else {
+               /* only want to do this the first time (when key is null)
+                * and not again on any potential 2nd variant lowering pass:
+                */
+               OPT_V(s, ir3_nir_apply_trig_workarounds);
+       }
+
+       OPT_V(s, nir_lower_tex, &tex_options);
+       OPT_V(s, nir_lower_load_const_to_scalar);
+       if (shader->compiler->gpu_id < 500)
+               OPT_V(s, ir3_nir_lower_tg4_to_tex);
+
+       ir3_optimize_loop(s);
+
+       /* do idiv lowering after first opt loop to give a chance for
+        * divide by immed power-of-two to be caught first:
+        */
+       if (OPT(s, nir_lower_idiv))
+               ir3_optimize_loop(s);
+
+       OPT_V(s, nir_remove_dead_variables, nir_var_local);
+
+       OPT_V(s, nir_move_load_const);
+
+       if (ir3_shader_debug & IR3_DBG_DISASM) {
+               debug_printf("----------------------\n");
+               nir_print_shader(s, stdout);
+               debug_printf("----------------------\n");
+       }
+
+       nir_sweep(s);
+
+       return s;
+}
+
+void
+ir3_nir_scan_driver_consts(nir_shader *shader,
+               struct ir3_driver_const_layout *layout)
+{
+       nir_foreach_function(function, shader) {
+               if (!function->impl)
+                       continue;
+
+               nir_foreach_block(block, function->impl) {
+                       nir_foreach_instr(instr, block) {
+                               if (instr->type != nir_instr_type_intrinsic)
+                                       continue;
+
+                               nir_intrinsic_instr *intr =
+                                       nir_instr_as_intrinsic(instr);
+                               unsigned idx;
+
+                               switch (intr->intrinsic) {
+                               case nir_intrinsic_get_buffer_size:
+                                       idx = nir_src_as_const_value(intr->src[0])->u32[0];
+                                       if (layout->ssbo_size.mask & (1 << idx))
+                                               break;
+                                       layout->ssbo_size.mask |= (1 << idx);
+                                       layout->ssbo_size.off[idx] =
+                                               layout->ssbo_size.count;
+                                       layout->ssbo_size.count += 1; /* one const per SSBO */
+                                       break;
+                               case nir_intrinsic_image_deref_atomic_add:
+                               case nir_intrinsic_image_deref_atomic_min:
+                               case nir_intrinsic_image_deref_atomic_max:
+                               case nir_intrinsic_image_deref_atomic_and:
+                               case nir_intrinsic_image_deref_atomic_or:
+                               case nir_intrinsic_image_deref_atomic_xor:
+                               case nir_intrinsic_image_deref_atomic_exchange:
+                               case nir_intrinsic_image_deref_atomic_comp_swap:
+                               case nir_intrinsic_image_deref_store:
+                               case nir_intrinsic_image_deref_size:
+                                       idx = nir_intrinsic_get_var(intr, 0)->data.driver_location;
+                                       if (layout->image_dims.mask & (1 << idx))
+                                               break;
+                                       layout->image_dims.mask |= (1 << idx);
+                                       layout->image_dims.off[idx] =
+                                               layout->image_dims.count;
+                                       layout->image_dims.count += 3; /* three consts per image */
+                                       break;
+                               default:
+                                       break;
+                               }
+                       }
+               }
+       }
+}
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
new file mode 100644 (file)
index 0000000..74201d3
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef IR3_NIR_H_
+#define IR3_NIR_H_
+
+#include "compiler/nir/nir.h"
+#include "compiler/shader_enums.h"
+
+#include "ir3_shader.h"
+
+void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layout *layout);
+
+bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
+bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
+
+const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
+bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
+struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+               const struct ir3_shader_key *key);
+
+#endif /* IR3_NIR_H_ */
diff --git a/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
new file mode 100644 (file)
index 0000000..37a3dcb
--- /dev/null
@@ -0,0 +1,138 @@
+/*
+ * Copyright © 2017 Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
+ * gather results, rather than before. As a result, it must be emulated with
+ * direct texture calls.
+ */
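+
+/* The four taps are emitted below in gather order: offsets (0,1),
+ * (1,1), (1,0) and finally the un-offset (0,0) texel, each as a txl
+ * at lod 0, taking the gathered component from each result and
+ * recombining the four scalars with a vec4.
+ */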
+
+static bool
+lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx)
+{
+       bool progress = false;
+
+       static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
+
+       nir_foreach_instr_safe(instr, block) {
+               if (instr->type != nir_instr_type_tex)
+                       continue;
+
+               nir_tex_instr *tg4 = (nir_tex_instr *)instr;
+
+               if (tg4->op != nir_texop_tg4)
+                       continue;
+
+               b->cursor = nir_before_instr(&tg4->instr);
+
+               nir_ssa_def *results[4];
+               int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
+               for (int i = 0; i < 4; i++) {
+                       int num_srcs = tg4->num_srcs + 1 /* lod */;
+                       if (offset_index < 0 && i < 3)
+                               num_srcs++;
+
+                       nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
+                       tex->op = nir_texop_txl;
+                       tex->sampler_dim = tg4->sampler_dim;
+                       tex->coord_components = tg4->coord_components;
+                       tex->is_array = tg4->is_array;
+                       tex->is_shadow = tg4->is_shadow;
+                       tex->is_new_style_shadow = tg4->is_new_style_shadow;
+                       tex->texture_index = tg4->texture_index;
+                       tex->sampler_index = tg4->sampler_index;
+                       tex->dest_type = tg4->dest_type;
+
+                       for (int j = 0; j < tg4->num_srcs; j++) {
+                               nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
+                               tex->src[j].src_type = tg4->src[j].src_type;
+                       }
+                       if (i != 3) {
+                               nir_ssa_def *offset =
+                                       nir_vec2(b, nir_imm_int(b, offsets[i][0]),
+                                                        nir_imm_int(b, offsets[i][1]));
+                               if (offset_index < 0) {
+                                       tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
+                                       tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
+                               } else {
+                                       assert(nir_tex_instr_src_size(tex, offset_index) == 2);
+                                       nir_ssa_def *orig = nir_ssa_for_src(
+                                                       b, tex->src[offset_index].src, 2);
+                                       tex->src[offset_index].src =
+                                               nir_src_for_ssa(nir_iadd(b, orig, offset));
+                               }
+                       }
+                       tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
+                       tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
+
+                       nir_ssa_dest_init(&tex->instr, &tex->dest,
+                                                         nir_tex_instr_dest_size(tex), 32, NULL);
+                       nir_builder_instr_insert(b, &tex->instr);
+
+                       results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
+               }
+
+               nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]);
+               nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result));
+
+               nir_instr_remove(&tg4->instr);
+
+               progress = true;
+       }
+
+       return progress;
+}
+
+static bool
+lower_tg4_func(nir_function_impl *impl)
+{
+       void *mem_ctx = ralloc_parent(impl);
+       nir_builder b;
+       nir_builder_init(&b, impl);
+
+       bool progress = false;
+       nir_foreach_block_safe(block, impl) {
+               progress |= lower_tg4(block, &b, mem_ctx);
+       }
+
+       if (progress)
+               nir_metadata_preserve(impl, nir_metadata_block_index |
+                                                                       nir_metadata_dominance);
+
+       return progress;
+}
+
+bool
+ir3_nir_lower_tg4_to_tex(nir_shader *shader)
+{
+       bool progress = false;
+
+       nir_foreach_function(function, shader) {
+               if (function->impl)
+                       progress |= lower_tg4_func(function->impl);
+       }
+
+       return progress;
+}
diff --git a/src/freedreno/ir3/ir3_nir_trig.py b/src/freedreno/ir3/ir3_nir_trig.py
new file mode 100644 (file)
index 0000000..3968aea
--- /dev/null
@@ -0,0 +1,51 @@
+#
+# Copyright (C) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+from __future__ import print_function
+
+import argparse
+import sys
+
+trig_workarounds = [
+   (('fsin', 'x'), ('fsin', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
+   (('fcos', 'x'), ('fcos', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
+]
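+
+# The rewrite is a range reduction: since
+#   2*pi * fract(x/(2*pi) + 0.5) - pi  ==  x - 2*pi*round(x/(2*pi))
+# differs from x by a whole number of periods, sin/cos are unchanged,
+# but the argument is folded into [-pi, pi] where the hw instructions
+# behave.  The constants are 1/(2*pi), 2*pi and pi to 6 decimals.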
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--import-path', required=True)
+    args = parser.parse_args()
+    sys.path.insert(0, args.import_path)
+    run()
+
+
+def run():
+    import nir_algebraic  # pylint: disable=import-error
+
+    print('#include "ir3_nir.h"')
+    print(nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds",
+                                      trig_workarounds).render())
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
new file mode 100644 (file)
index 0000000..b6ef6e4
--- /dev/null
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "ir3.h"
+
+#define PTRID(x) ((unsigned long)(x))
+
+static void print_instr_name(struct ir3_instruction *instr)
+{
+       if (!instr)
+               return;
+#ifdef DEBUG
+       printf("%04u:", instr->serialno);
+#endif
+       printf("%04u:", instr->name);
+       printf("%04u:", instr->ip);
+       printf("%03u: ", instr->depth);
+
+       if (instr->flags & IR3_INSTR_SY)
+               printf("(sy)");
+       if (instr->flags & IR3_INSTR_SS)
+               printf("(ss)");
+
+       if (is_meta(instr)) {
+               switch (instr->opc) {
+               case OPC_META_INPUT:  printf("_meta:in");   break;
+               case OPC_META_FO:     printf("_meta:fo");   break;
+               case OPC_META_FI:     printf("_meta:fi");   break;
+
+               /* shouldn't hit here.. just for debugging: */
+               default: printf("_meta:%d", instr->opc);    break;
+               }
+       } else if (instr->opc == OPC_MOV) {
+               static const char *type[] = {
+                               [TYPE_F16] = "f16",
+                               [TYPE_F32] = "f32",
+                               [TYPE_U16] = "u16",
+                               [TYPE_U32] = "u32",
+                               [TYPE_S16] = "s16",
+                               [TYPE_S32] = "s32",
+                               [TYPE_U8]  = "u8",
+                               [TYPE_S8]  = "s8",
+               };
+               if (instr->cat1.src_type == instr->cat1.dst_type)
+                       printf("mov");
+               else
+                       printf("cov");
+               printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
+       } else {
+               printf("%s", ir3_instr_name(instr));
+               if (instr->flags & IR3_INSTR_3D)
+                       printf(".3d");
+               if (instr->flags & IR3_INSTR_A)
+                       printf(".a");
+               if (instr->flags & IR3_INSTR_O)
+                       printf(".o");
+               if (instr->flags & IR3_INSTR_P)
+                       printf(".p");
+               if (instr->flags & IR3_INSTR_S)
+                       printf(".s");
+               if (instr->flags & IR3_INSTR_S2EN)
+                       printf(".s2en");
+       }
+}
+
+static void print_reg_name(struct ir3_register *reg)
+{
+       if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
+                       (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
+               printf("(absneg)");
+       else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
+               printf("(neg)");
+       else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
+               printf("(abs)");
+
+       if (reg->flags & IR3_REG_IMMED) {
+               printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
+       } else if (reg->flags & IR3_REG_ARRAY) {
+               printf("arr[id=%u, offset=%d, size=%u", reg->array.id,
+                               reg->array.offset, reg->size);
+		/* for ARRAY we could have a null src, for example the first
+		 * write instruction:
+		 */
+               if (reg->instr) {
+                       printf(", _[");
+                       print_instr_name(reg->instr);
+                       printf("]");
+               }
+               printf("]");
+       } else if (reg->flags & IR3_REG_SSA) {
+               printf("_[");
+               print_instr_name(reg->instr);
+               printf("]");
+       } else if (reg->flags & IR3_REG_RELATIV) {
+               if (reg->flags & IR3_REG_HALF)
+                       printf("h");
+               if (reg->flags & IR3_REG_CONST)
+                       printf("c<a0.x + %d>", reg->array.offset);
+               else
+                       printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size);
+       } else {
+               if (reg->flags & IR3_REG_HALF)
+                       printf("h");
+               if (reg->flags & IR3_REG_CONST)
+                       printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+               else
+                       printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
+       }
+}
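+
+/* For example (illustrative), an immediate 1.0f is printed as
+ * imm[1.000000,1065353216,0x3f800000] (float, signed int and hex views
+ * of the same bits), and an SSA src as _[<instr-name>].
+ */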
+
+static void
+tab(int lvl)
+{
+       for (int i = 0; i < lvl; i++)
+               printf("\t");
+}
+
+static void
+print_instr(struct ir3_instruction *instr, int lvl)
+{
+       unsigned i;
+
+       tab(lvl);
+
+       print_instr_name(instr);
+       for (i = 0; i < instr->regs_count; i++) {
+               struct ir3_register *reg = instr->regs[i];
+               printf(i ? ", " : " ");
+               print_reg_name(reg);
+       }
+
+       if (instr->address) {
+               printf(", address=_");
+               printf("[");
+               print_instr_name(instr->address);
+               printf("]");
+       }
+
+       if (instr->cp.left) {
+               printf(", left=_");
+               printf("[");
+               print_instr_name(instr->cp.left);
+               printf("]");
+       }
+
+       if (instr->cp.right) {
+               printf(", right=_");
+               printf("[");
+               print_instr_name(instr->cp.right);
+               printf("]");
+       }
+
+       if (instr->opc == OPC_META_FO) {
+               printf(", off=%d", instr->fo.off);
+       }
+
+       if (is_flow(instr) && instr->cat0.target) {
+               /* the predicate register src is implied: */
+               if (instr->opc == OPC_BR) {
+                       printf(" %sp0.x", instr->cat0.inv ? "!" : "");
+               }
+               printf(", target=block%u", block_id(instr->cat0.target));
+       }
+
+       if (instr->deps_count) {
+               printf(", false-deps:");
+               for (unsigned i = 0; i < instr->deps_count; i++) {
+                       if (i > 0)
+                               printf(", ");
+                       printf("_[");
+                       print_instr_name(instr->deps[i]);
+                       printf("]");
+               }
+       }
+
+       printf("\n");
+}
+
+void ir3_print_instr(struct ir3_instruction *instr)
+{
+       print_instr(instr, 0);
+}
+
+static void
+print_block(struct ir3_block *block, int lvl)
+{
+       tab(lvl); printf("block%u {\n", block_id(block));
+
+       if (block->predecessors_count > 0) {
+               tab(lvl+1);
+               printf("pred: ");
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       if (i)
+                               printf(", ");
+                       printf("block%u", block_id(block->predecessors[i]));
+               }
+               printf("\n");
+       }
+
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               print_instr(instr, lvl+1);
+       }
+
+       tab(lvl+1); printf("/* keeps:\n");
+       for (unsigned i = 0; i < block->keeps_count; i++) {
+               print_instr(block->keeps[i], lvl+2);
+       }
+       tab(lvl+1); printf(" */\n");
+
+       if (block->successors[1]) {
+               /* leading into if/else: */
+               tab(lvl+1);
+               printf("/* succs: if _[");
+               print_instr_name(block->condition);
+               printf("] block%u; else block%u; */\n",
+                               block_id(block->successors[0]),
+                               block_id(block->successors[1]));
+       } else if (block->successors[0]) {
+               tab(lvl+1);
+               printf("/* succs: block%u; */\n",
+                               block_id(block->successors[0]));
+       }
+       tab(lvl); printf("}\n");
+}
+
+void
+ir3_print(struct ir3 *ir)
+{
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+               print_block(block, 0);
+
+       for (unsigned i = 0; i < ir->noutputs; i++) {
+               if (!ir->outputs[i])
+                       continue;
+		printf("out%u: ", i);
+               print_instr(ir->outputs[i], 0);
+       }
+}
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
new file mode 100644 (file)
index 0000000..ad09c40
--- /dev/null
@@ -0,0 +1,1124 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/u_math.h"
+#include "util/register_allocate.h"
+#include "util/ralloc.h"
+#include "util/bitset.h"
+
+#include "ir3.h"
+#include "ir3_compiler.h"
+
+/*
+ * Register Assignment:
+ *
+ * Uses the register_allocate util, which implements graph coloring
+ * algo with interference classes.  To handle the cases where we need
+ * consecutive registers (for example, texture sample instructions),
+ * we model these as larger (double/quad/etc) registers which conflict
+ * with the corresponding registers in other classes.
+ *
+ * Additionally we create additional classes for half-regs, which
+ * do not conflict with the full-reg classes.  We do need at least
+ * sizes 1-4 (to deal w/ texture sample instructions output to half-
+ * reg).  At the moment we don't create the higher order half-reg
+ * classes as half-reg frequently does not have enough precision
+ * for texture coords at higher resolutions.
+ *
+ * There are some additional cases that we need to handle specially,
+ * as the graph coloring algo doesn't understand "partial writes".
+ * For example, a sequence like:
+ *
+ *   add r0.z, ...
+ *   sam (f32)(xy)r0.x, ...
+ *   ...
+ *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
+ *
+ * In this scenario, we treat r0.xyz as a single class-size-3 value,
+ * which is written (from a use/def perspective) at the 'add'
+ * instruction, and we ignore the subsequent partial writes to r0.xy.
+ * So the 'add r0.z, ...' is the defining instruction, as it is the
+ * first to partially write r0.xyz.
+ *
+ * Note i965 has a similar scenario, which they solve with a virtual
+ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
+ * register assignment.  But for us that is horrible from a scheduling
+ * standpoint.  Instead what we do is use idea of 'definer' instruction.
+ * Ie. the first instruction (lowest ip) to write to the variable is the
+ * one we consider from use/def perspective when building interference
+ * graph.  (Other instructions which write other variable components
+ * just define the variable some more.)
+ *
+ * Arrays of arbitrary size are handled via pre-coloring a consecutive
+ * sequence of registers.  Additional scalar (single component) reg
+ * names are allocated starting at ctx->class_base[total_class_count]
+ * (see arr->base), which are pre-colored.  In the use/def graph direct
+ * access is treated as a single element use/def, and indirect access
+ * is treated as use or def of all array elements.  (Only the first
+ * def is tracked, in case of multiple indirect writes, etc.)
+ *
+ * TODO arrays that fit in one of the pre-defined class sizes should
+ * not need to be pre-colored, but instead could be given a normal
+ * vreg name.  (Ignoring this for now since it is a good way to work
+ * out the kinks with arbitrary sized arrays.)
+ *
+ * TODO might be easier for debugging to split this into two passes,
+ * the first assigning vreg names in a way that we could ir3_print()
+ * the result.
+ */
+
+static const unsigned class_sizes[] = {
+       1, 2, 3, 4,
+       4 + 4, /* txd + 1d/2d */
+       4 + 6, /* txd + 3d */
+};
+#define class_count ARRAY_SIZE(class_sizes)
+
+static const unsigned half_class_sizes[] = {
+       1, 2, 3, 4,
+};
+#define half_class_count  ARRAY_SIZE(half_class_sizes)
+
+/* High regs seem to just be used for compute shaders?  Seems like vec1
+ * and vec3 are sufficient (for now?)
+ */
+static const unsigned high_class_sizes[] = {
+       1, 3,
+};
+#define high_class_count ARRAY_SIZE(high_class_sizes)
+
+#define total_class_count (class_count + half_class_count + high_class_count)
+
+/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
+#define NUM_REGS             (4 * 48)  /* r0 to r47 */
+#define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
+#define FIRST_HIGH_REG       (4 * 48)
+/* Number of virtual regs in a given class: */
+#define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
+#define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
+#define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
+
+#define HALF_OFFSET          (class_count)
+#define HIGH_OFFSET          (class_count + half_class_count)
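+
+/* e.g. the size-2 full class has CLASS_REGS(1) = 192 - 1 = 191 virtual
+ * registers, one for each scalar starting position r0.x..r47.z that a
+ * consecutive pair of registers can occupy.
+ */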
+
+/* register-set, created one time, used for all shaders: */
+struct ir3_ra_reg_set {
+       struct ra_regs *regs;
+       unsigned int classes[class_count];
+       unsigned int half_classes[half_class_count];
+       unsigned int high_classes[high_class_count];
+       /* maps flat virtual register space to base gpr: */
+       uint16_t *ra_reg_to_gpr;
+       /* maps cls,gpr to flat virtual register space: */
+       uint16_t **gpr_to_ra_reg;
+};
+
+static void
+build_q_values(unsigned int **q_values, unsigned off,
+               const unsigned *sizes, unsigned count)
+{
+       for (unsigned i = 0; i < count; i++) {
+               q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
+
+               /* From register_allocate.c:
+                *
+                * q(B,C) (indexed by C, B is this register class) in
+                * Runeson/Nyström paper.  This is "how many registers of B could
+                * the worst choice register from C conflict with".
+                *
+		 * If we just let the register allocation algorithm compute these
+		 * values, it is extremely expensive.  However, since all of our
+                * registers are laid out, we can very easily compute them
+                * ourselves.  View the register from C as fixed starting at GRF n
+                * somewhere in the middle, and the register from B as sliding back
+                * and forth.  Then the first register to conflict from B is the
+                * one starting at n - class_size[B] + 1 and the last register to
+                * conflict will start at n + class_size[B] - 1.  Therefore, the
+                * number of conflicts from B is class_size[B] + class_size[C] - 1.
+                *
+                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+                * B | | | | | |n| --> | | | | | | |
+                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+                *             +-+-+-+-+-+
+                * C           |n| | | | |
+                *             +-+-+-+-+-+
+                *
+                * (Idea copied from brw_fs_reg_allocate.cpp)
+                */
+               for (unsigned j = 0; j < count; j++)
+                       q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
+       }
+}
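+
+/* Worked example: with class_sizes[] above, a double (size 2) versus a
+ * triple (size 3) gives q = 2 + 3 - 1 = 4, ie. the worst-placed triple
+ * can conflict with four different double registers.
+ */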
+
+/* One-time setup of RA register-set, which describes all the possible
+ * "virtual" registers and their interferences.  Ie. double register
+ * occupies (and conflicts with) two single registers, and so forth.
+ * Since registers do not need to be aligned to their class size, they
+ * can conflict with other registers in the same class too.  Ie:
+ *
+ *    Single (base) |  Double
+ *    --------------+---------------
+ *       R0         |  D0
+ *       R1         |  D0 D1
+ *       R2         |     D1 D2
+ *       R3         |        D2
+ *           .. and so on..
+ *
+ * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
+ * really just four scalar registers.  Don't let that confuse you.)
+ */
+struct ir3_ra_reg_set *
+ir3_ra_alloc_reg_set(struct ir3_compiler *compiler)
+{
+       struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set);
+       unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
+       unsigned int **q_values;
+
+       /* calculate # of regs across all classes: */
+       ra_reg_count = 0;
+       for (unsigned i = 0; i < class_count; i++)
+               ra_reg_count += CLASS_REGS(i);
+       for (unsigned i = 0; i < half_class_count; i++)
+               ra_reg_count += HALF_CLASS_REGS(i);
+       for (unsigned i = 0; i < high_class_count; i++)
+               ra_reg_count += HIGH_CLASS_REGS(i);
+
+       /* allocate and populate q_values: */
+       q_values = ralloc_array(set, unsigned *, total_class_count);
+
+       build_q_values(q_values, 0, class_sizes, class_count);
+       build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
+       build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
+
+       /* allocate the reg-set.. */
+       set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
+       set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
+       set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
+
+       /* .. and classes */
+       reg = 0;
+       for (unsigned i = 0; i < class_count; i++) {
+               set->classes[i] = ra_alloc_reg_class(set->regs);
+
+               set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+               for (unsigned j = 0; j < CLASS_REGS(i); j++) {
+                       ra_class_add_reg(set->regs, set->classes[i], reg);
+
+                       set->ra_reg_to_gpr[reg] = j;
+                       set->gpr_to_ra_reg[i][j] = reg;
+
+                       for (unsigned br = j; br < j + class_sizes[i]; br++)
+                               ra_add_transitive_reg_conflict(set->regs, br, reg);
+
+                       reg++;
+               }
+       }
+
+       first_half_reg = reg;
+       base = HALF_OFFSET;
+
+       for (unsigned i = 0; i < half_class_count; i++) {
+               set->half_classes[i] = ra_alloc_reg_class(set->regs);
+
+               set->gpr_to_ra_reg[base + i] =
+                               ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
+
+               for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
+                       ra_class_add_reg(set->regs, set->half_classes[i], reg);
+
+                       set->ra_reg_to_gpr[reg] = j;
+                       set->gpr_to_ra_reg[base + i][j] = reg;
+
+                       for (unsigned br = j; br < j + half_class_sizes[i]; br++)
+                               ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
+
+                       reg++;
+               }
+       }
+
+       first_high_reg = reg;
+       base = HIGH_OFFSET;
+
+       for (unsigned i = 0; i < high_class_count; i++) {
+               set->high_classes[i] = ra_alloc_reg_class(set->regs);
+
+               set->gpr_to_ra_reg[base + i] =
+                               ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
+
+               for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
+                       ra_class_add_reg(set->regs, set->high_classes[i], reg);
+
+                       set->ra_reg_to_gpr[reg] = j;
+                       set->gpr_to_ra_reg[base + i][j] = reg;
+
+                       for (unsigned br = j; br < j + high_class_sizes[i]; br++)
+                               ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
+
+                       reg++;
+               }
+       }
+
+       /* starting a6xx, half precision regs conflict w/ full precision regs: */
+       if (compiler->gpu_id >= 600) {
+               /* because of transitivity, we can get away with just setting up
+                * conflicts between the first class of full and half regs:
+                */
+               for (unsigned j = 0; j < CLASS_REGS(0) / 2; j++) {
+                       unsigned freg  = set->gpr_to_ra_reg[0][j];
+                       unsigned hreg0 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 0];
+                       unsigned hreg1 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 1];
+
+                       ra_add_transitive_reg_conflict(set->regs, freg, hreg0);
+                       ra_add_transitive_reg_conflict(set->regs, freg, hreg1);
+               }
+
+               // TODO also need to update q_values, but for now:
+               ra_set_finalize(set->regs, NULL);
+       } else {
+               ra_set_finalize(set->regs, q_values);
+       }
+
+       ralloc_free(q_values);
+
+       return set;
+}
+
+/* additional block-data (per-block) */
+struct ir3_ra_block_data {
+       BITSET_WORD *def;        /* variables defined before used in block */
+       BITSET_WORD *use;        /* variables used before defined in block */
+       BITSET_WORD *livein;     /* which defs reach entry point of block */
+       BITSET_WORD *liveout;    /* which defs reach exit point of block */
+};
+
+/* additional instruction-data (per-instruction) */
+struct ir3_ra_instr_data {
+       /* cached instruction 'definer' info: */
+       struct ir3_instruction *defn;
+       int off, sz, cls;
+};
+
+/* register-assign context, per-shader */
+struct ir3_ra_ctx {
+       struct ir3 *ir;
+       gl_shader_stage type;
+       bool frag_face;
+
+       struct ir3_ra_reg_set *set;
+       struct ra_graph *g;
+       unsigned alloc_count;
+       /* one per class, plus one slot for arrays: */
+       unsigned class_alloc_count[total_class_count + 1];
+       unsigned class_base[total_class_count + 1];
+       unsigned instr_cnt;
+       unsigned *def, *use;     /* def/use table */
+       struct ir3_ra_instr_data *instrd;
+};
+
+/* does it conflict? */
+static inline bool
+intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
+{
+       return !((a_start >= b_end) || (b_start >= a_end));
+}
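+
+/* e.g. intersects(2, 5, 4, 7) is true, while intersects(2, 4, 4, 7) is
+ * false: the ranges are treated as half-open, so ranges that merely
+ * touch at an endpoint do not conflict.
+ */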
+
+static bool
+is_half(struct ir3_instruction *instr)
+{
+       return !!(instr->regs[0]->flags & IR3_REG_HALF);
+}
+
+static bool
+is_high(struct ir3_instruction *instr)
+{
+       return !!(instr->regs[0]->flags & IR3_REG_HIGH);
+}
+
+static int
+size_to_class(unsigned sz, bool half, bool high)
+{
+       if (high) {
+               for (unsigned i = 0; i < high_class_count; i++)
+                       if (high_class_sizes[i] >= sz)
+                               return i + HIGH_OFFSET;
+       } else if (half) {
+               for (unsigned i = 0; i < half_class_count; i++)
+                       if (half_class_sizes[i] >= sz)
+                               return i + HALF_OFFSET;
+       } else {
+               for (unsigned i = 0; i < class_count; i++)
+                       if (class_sizes[i] >= sz)
+                               return i;
+       }
+       debug_assert(0);
+       return -1;
+}
+
+static bool
+writes_gpr(struct ir3_instruction *instr)
+{
+       if (is_store(instr))
+               return false;
+       /* is dest a normal temp register: */
+       struct ir3_register *reg = instr->regs[0];
+       if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+               return false;
+       if ((reg->num == regid(REG_A0, 0)) ||
+                       (reg->num == regid(REG_P0, 0)))
+               return false;
+       return true;
+}
+
+static bool
+instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
+{
+       if (a->flags & IR3_INSTR_UNUSED)
+               return false;
+       return (a->ip < b->ip);
+}
+
+static struct ir3_instruction *
+get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
+               int *sz, int *off)
+{
+       struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+       struct ir3_instruction *d = NULL;
+
+       if (id->defn) {
+               *sz = id->sz;
+               *off = id->off;
+               return id->defn;
+       }
+
+       if (instr->opc == OPC_META_FI) {
+		/* What about the case where the collect is a subset of an
+		 * array?  We'd need to find the distance between where the
+		 * actual array starts and the fanin..  that probably doesn't
+		 * happen currently.
+		 */
+               struct ir3_register *src;
+               int dsz, doff;
+
+               /* note: don't use foreach_ssa_src as this gets called once
+                * while assigning regs (which clears SSA flag)
+                */
+               foreach_src_n(src, n, instr) {
+                       struct ir3_instruction *dd;
+                       if (!src->instr)
+                               continue;
+
+                       dd = get_definer(ctx, src->instr, &dsz, &doff);
+
+                       if ((!d) || instr_before(dd, d)) {
+                               d = dd;
+                               *sz = dsz;
+                               *off = doff - n;
+                       }
+               }
+
+       } else if (instr->cp.right || instr->cp.left) {
+               /* covers also the meta:fo case, which ends up w/ single
+                * scalar instructions for each component:
+                */
+               struct ir3_instruction *f = ir3_neighbor_first(instr);
+
+		/* by definition, the entire sequence forms one linked list
+		 * of single scalar register nodes (even if some of them may
+		 * be fanouts from a texture sample instruction, for example).
+		 * We just need to walk the list to find the first element of
+		 * the group defined (lowest ip):
+		 */
+               int cnt = 0;
+
+               /* need to skip over unused in the group: */
+               while (f && (f->flags & IR3_INSTR_UNUSED)) {
+                       f = f->cp.right;
+                       cnt++;
+               }
+
+               while (f) {
+                       if ((!d) || instr_before(f, d))
+                               d = f;
+                       if (f == instr)
+                               *off = cnt;
+                       f = f->cp.right;
+                       cnt++;
+               }
+
+               *sz = cnt;
+
+       } else {
+               /* second case is looking directly at the instruction which
+                * produces multiple values (eg, texture sample), rather
+                * than the fanout nodes that point back to that instruction.
+                * This isn't quite right, because it may be part of a larger
+                * group, such as:
+                *
+                *     sam (f32)(xyzw)r0.x, ...
+                *     add r1.x, ...
+                *     add r1.y, ...
+                *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
+                *
+                * need to come up with a better way to handle that case.
+                */
+               if (instr->address) {
+                       *sz = instr->regs[0]->size;
+               } else {
+                       *sz = util_last_bit(instr->regs[0]->wrmask);
+               }
+               *off = 0;
+               d = instr;
+       }
+
+       if (d->opc == OPC_META_FO) {
+               struct ir3_instruction *dd;
+               int dsz, doff;
+
+               dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
+
+               /* by definition, should come before: */
+               debug_assert(instr_before(dd, d));
+
+               *sz = MAX2(*sz, dsz);
+
+               debug_assert(instr->opc == OPC_META_FO);
+               *off = MAX2(*off, instr->fo.off);
+
+               d = dd;
+       }
+
+       id->defn = d;
+       id->sz = *sz;
+       id->off = *off;
+
+       return d;
+}
+
+static void
+ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+               if (instr->regs_count == 0)
+                       continue;
+               /* couple special cases: */
+               if (writes_addr(instr) || writes_pred(instr)) {
+                       id->cls = -1;
+               } else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+                       id->cls = total_class_count;
+               } else {
+                       id->defn = get_definer(ctx, instr, &id->sz, &id->off);
+                       id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
+               }
+       }
+}
+
+/* give each instruction a name (and ip), and count up the # of names
+ * of each class
+ */
+static void
+ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+
+#ifdef DEBUG
+               instr->name = ~0;
+#endif
+
+               ctx->instr_cnt++;
+
+               if (instr->regs_count == 0)
+                       continue;
+
+               if (!writes_gpr(instr))
+                       continue;
+
+               if (id->defn != instr)
+                       continue;
+
+               /* arrays which don't fit in one of the pre-defined class
+                * sizes are pre-colored:
+                */
+               if ((id->cls >= 0) && (id->cls < total_class_count)) {
+                       instr->name = ctx->class_alloc_count[id->cls]++;
+                       ctx->alloc_count++;
+               }
+       }
+}
+
+static void
+ra_init(struct ir3_ra_ctx *ctx)
+{
+       unsigned n, base;
+
+       ir3_clear_mark(ctx->ir);
+       n = ir3_count_instructions(ctx->ir);
+
+       ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
+
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               ra_block_find_definers(ctx, block);
+       }
+
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               ra_block_name_instructions(ctx, block);
+       }
+
+       /* figure out the base register name for each class.  The
+        * actual ra name is class_base[cls] + instr->name;
+        */
+       ctx->class_base[0] = 0;
+       for (unsigned i = 1; i <= total_class_count; i++) {
+               ctx->class_base[i] = ctx->class_base[i-1] +
+                               ctx->class_alloc_count[i-1];
+       }
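+
+	/* e.g. (illustrative) with 10 names in class 0 and 4 in class 1,
+	 * class_base[] begins 0, 10, 14, ... so a class-1 defn with
+	 * name 3 gets ra name 10 + 3 = 13.
+	 */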
+
+       /* and vreg names for array elements: */
+       base = ctx->class_base[total_class_count];
+       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+               arr->base = base;
+               ctx->class_alloc_count[total_class_count] += arr->length;
+               base += arr->length;
+       }
+       ctx->alloc_count += ctx->class_alloc_count[total_class_count];
+
+       ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+       ralloc_steal(ctx->g, ctx->instrd);
+       ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+       ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+}
+
+static unsigned
+__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+{
+       unsigned name;
+       debug_assert(cls >= 0);
+       debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
+       name = ctx->class_base[cls] + defn->name;
+       debug_assert(name < ctx->alloc_count);
+       return name;
+}
+
+static int
+ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
+{
+       /* TODO handle name mapping for arrays */
+       return __ra_name(ctx, id->cls, id->defn);
+}
+
+static void
+ra_destroy(struct ir3_ra_ctx *ctx)
+{
+       ralloc_free(ctx->g);
+}
+
+static void
+ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+       struct ir3_ra_block_data *bd;
+       unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+
+#define def(name, instr) \
+               do { \
+                       /* defined on first write: */ \
+                       if (!ctx->def[name]) \
+                               ctx->def[name] = instr->ip; \
+                       ctx->use[name] = instr->ip; \
+                       BITSET_SET(bd->def, name); \
+		} while(0)
+
+#define use(name, instr) \
+               do { \
+                       ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
+                       if (!BITSET_TEST(bd->def, name)) \
+                               BITSET_SET(bd->use, name); \
+		} while(0)
+
+       bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+
+       bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
+       bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+       block->data = bd;
+
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_instruction *src;
+               struct ir3_register *reg;
+
+               if (instr->regs_count == 0)
+                       continue;
+
+               /* There are a couple special cases to deal with here:
+                *
+                * fanout: used to split values from a higher class to a lower
+                *     class, for example split the results of a texture fetch
+                *     into individual scalar values;  We skip over these from
+                *     a 'def' perspective, and for a 'use' we walk the chain
+                *     up to the defining instruction.
+                *
+                * fanin: used to collect values from lower class and assemble
+                *     them together into a higher class, for example arguments
+                *     to texture sample instructions;  We consider these to be
+                *     defined at the earliest fanin source.
+                *
+                * Most of this is handled in the get_definer() helper.
+                *
+                * In either case, we trace the instruction back to the original
+                * definer and consider that as the def/use ip.
+                */
+
+               if (writes_gpr(instr)) {
+                       struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+                       struct ir3_register *dst = instr->regs[0];
+
+                       if (dst->flags & IR3_REG_ARRAY) {
+                               struct ir3_array *arr =
+                                       ir3_lookup_array(ctx->ir, dst->array.id);
+                               unsigned i;
+
+                               arr->start_ip = MIN2(arr->start_ip, instr->ip);
+                               arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
+                               /* set the node class now.. in case we don't encounter
+                                * this array dst again.  From register_alloc algo's
+                                * perspective, these are all single/scalar regs:
+                                */
+                               for (i = 0; i < arr->length; i++) {
+                                       unsigned name = arr->base + i;
+                                       ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
+                               }
+
+                               /* indirect write is treated like a write to all array
+                                * elements, since we don't know which one is actually
+                                * written:
+                                */
+                               if (dst->flags & IR3_REG_RELATIV) {
+                                       for (i = 0; i < arr->length; i++) {
+                                               unsigned name = arr->base + i;
+                                               def(name, instr);
+                                       }
+                               } else {
+                                       unsigned name = arr->base + dst->array.offset;
+                                       def(name, instr);
+                               }
+
+                       } else if (id->defn == instr) {
+                               unsigned name = ra_name(ctx, id);
+
+                               /* since we are in SSA at this point: */
+                               debug_assert(!BITSET_TEST(bd->use, name));
+
+                               def(name, id->defn);
+
+                               if (is_high(id->defn)) {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->high_classes[id->cls - HIGH_OFFSET]);
+                               } else if (is_half(id->defn)) {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->half_classes[id->cls - HALF_OFFSET]);
+                               } else {
+                                       ra_set_node_class(ctx->g, name,
+                                                       ctx->set->classes[id->cls]);
+                               }
+                       }
+               }
+
+               foreach_src(reg, instr) {
+                       if (reg->flags & IR3_REG_ARRAY) {
+                               struct ir3_array *arr =
+                                       ir3_lookup_array(ctx->ir, reg->array.id);
+                               arr->start_ip = MIN2(arr->start_ip, instr->ip);
+                               arr->end_ip = MAX2(arr->end_ip, instr->ip);
+
+				/* indirect read is treated like a read from all array
+                                * elements, since we don't know which one is actually
+                                * read:
+                                */
+                               if (reg->flags & IR3_REG_RELATIV) {
+                                       unsigned i;
+                                       for (i = 0; i < arr->length; i++) {
+                                               unsigned name = arr->base + i;
+                                               use(name, instr);
+                                       }
+                               } else {
+                                       unsigned name = arr->base + reg->array.offset;
+                                       use(name, instr);
+                                       /* NOTE: arrays are not SSA so unconditionally
+                                        * set use bit:
+                                        */
+                                       BITSET_SET(bd->use, name);
+                                       debug_assert(reg->array.offset < arr->length);
+                               }
+                       } else if ((src = ssa(reg)) && writes_gpr(src)) {
+                               unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
+                               use(name, instr);
+                       }
+               }
+       }
+}
+
+static bool
+ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
+{
+       unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+       bool progress = false;
+
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               struct ir3_ra_block_data *bd = block->data;
+
+               /* update livein: */
+               for (unsigned i = 0; i < bitset_words; i++) {
+                       BITSET_WORD new_livein =
+                               (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
+
+                       if (new_livein & ~bd->livein[i]) {
+                               bd->livein[i] |= new_livein;
+                               progress = true;
+                       }
+               }
+
+               /* update liveout: */
+               for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
+                       struct ir3_block *succ = block->successors[j];
+                       struct ir3_ra_block_data *succ_bd;
+
+                       if (!succ)
+                               continue;
+
+                       succ_bd = succ->data;
+
+                       for (unsigned i = 0; i < bitset_words; i++) {
+                               BITSET_WORD new_liveout =
+                                       (succ_bd->livein[i] & ~bd->liveout[i]);
+
+                               if (new_liveout) {
+                                       bd->liveout[i] |= new_liveout;
+                                       progress = true;
+                               }
+                       }
+               }
+       }
+
+       return progress;
+}
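+
+/* The above implements the standard backwards liveness fixed-point, in
+ * the same terms as the code:
+ *
+ *    livein  = use | (liveout & ~def)
+ *    liveout = OR of livein over all successors
+ *
+ * iterated (by the caller) until no block changes.
+ */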
+
+static void
+print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
+{
+       bool first = true;
+       debug_printf("  %s:", name);
+       for (unsigned i = 0; i < cnt; i++) {
+               if (BITSET_TEST(bs, i)) {
+                       if (!first)
+                               debug_printf(",");
+                       debug_printf(" %04u", i);
+                       first = false;
+               }
+       }
+       debug_printf("\n");
+}
+
+static void
+ra_add_interference(struct ir3_ra_ctx *ctx)
+{
+       struct ir3 *ir = ctx->ir;
+
+       /* initialize array live ranges: */
+       list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+               arr->start_ip = ~0;
+               arr->end_ip = 0;
+       }
+
+       /* compute live ranges (use/def) on a block level, also updating
+        * block's def/use bitmasks (used below to calculate per-block
+        * livein/liveout):
+        */
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               ra_block_compute_live_ranges(ctx, block);
+       }
+
+       /* update per-block livein/liveout: */
+       while (ra_compute_livein_liveout(ctx)) {}
+
+       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
+               debug_printf("AFTER LIVEIN/OUT:\n");
+               ir3_print(ir);
+               list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+                       struct ir3_ra_block_data *bd = block->data;
+                       debug_printf("block%u:\n", block_id(block));
+                       print_bitset("  def", bd->def, ctx->alloc_count);
+                       print_bitset("  use", bd->use, ctx->alloc_count);
+                       print_bitset("  l/i", bd->livein, ctx->alloc_count);
+                       print_bitset("  l/o", bd->liveout, ctx->alloc_count);
+               }
+               list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+                       debug_printf("array%u:\n", arr->id);
+                       debug_printf("  length:   %u\n", arr->length);
+                       debug_printf("  start_ip: %u\n", arr->start_ip);
+                       debug_printf("  end_ip:   %u\n", arr->end_ip);
+               }
+       }
+
+       /* extend start/end ranges based on livein/liveout info from cfg: */
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               struct ir3_ra_block_data *bd = block->data;
+
+               for (unsigned i = 0; i < ctx->alloc_count; i++) {
+                       if (BITSET_TEST(bd->livein, i)) {
+                               ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
+                               ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
+                       }
+
+                       if (BITSET_TEST(bd->liveout, i)) {
+                               ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
+                               ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
+                       }
+               }
+
+               list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+                       for (unsigned i = 0; i < arr->length; i++) {
+                               if (BITSET_TEST(bd->livein, i + arr->base)) {
+                                       arr->start_ip = MIN2(arr->start_ip, block->start_ip);
+                               }
+				if (BITSET_TEST(bd->liveout, i + arr->base)) {
+                                       arr->end_ip = MAX2(arr->end_ip, block->end_ip);
+                               }
+                       }
+               }
+       }
+
+       /* need to fix things up to keep outputs live: */
+       for (unsigned i = 0; i < ir->noutputs; i++) {
+               struct ir3_instruction *instr = ir->outputs[i];
+               unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
+               ctx->use[name] = ctx->instr_cnt;
+       }
+
+       for (unsigned i = 0; i < ctx->alloc_count; i++) {
+               for (unsigned j = 0; j < ctx->alloc_count; j++) {
+                       if (intersects(ctx->def[i], ctx->use[i],
+                                       ctx->def[j], ctx->use[j])) {
+                               ra_add_node_interference(ctx->g, i, j);
+                       }
+               }
+       }
+}
+
+/* some instructions need fix-up if dst register is half precision: */
+static void fixup_half_instr_dst(struct ir3_instruction *instr)
+{
+       switch (opc_cat(instr->opc)) {
+       case 1: /* move instructions */
+               instr->cat1.dst_type = half_type(instr->cat1.dst_type);
+               break;
+       case 3:
+               switch (instr->opc) {
+               case OPC_MAD_F32:
+                       instr->opc = OPC_MAD_F16;
+                       break;
+               case OPC_SEL_B32:
+                       instr->opc = OPC_SEL_B16;
+                       break;
+               case OPC_SEL_S32:
+                       instr->opc = OPC_SEL_S16;
+                       break;
+               case OPC_SEL_F32:
+                       instr->opc = OPC_SEL_F16;
+                       break;
+               case OPC_SAD_S32:
+                       instr->opc = OPC_SAD_S16;
+                       break;
+               /* instructions may already be fixed up: */
+               case OPC_MAD_F16:
+               case OPC_SEL_B16:
+               case OPC_SEL_S16:
+               case OPC_SEL_F16:
+               case OPC_SAD_S16:
+                       break;
+               default:
+                       assert(0);
+                       break;
+               }
+               break;
+       case 5:
+               instr->cat5.type = half_type(instr->cat5.type);
+               break;
+       }
+}
+/* some instructions need fix-up if src register is half precision: */
+static void fixup_half_instr_src(struct ir3_instruction *instr)
+{
+       switch (instr->opc) {
+       case OPC_MOV:
+               instr->cat1.src_type = half_type(instr->cat1.src_type);
+               break;
+       default:
+               break;
+       }
+}
+
+/* NOTE: instr could be NULL for the IR3_REG_ARRAY case, for the first
+ * array access(es), which do not have any previous access to depend
+ * on from a scheduling point of view.
+ */
+static void
+reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
+               struct ir3_instruction *instr)
+{
+       struct ir3_ra_instr_data *id;
+
+       if (reg->flags & IR3_REG_ARRAY) {
+               struct ir3_array *arr =
+                       ir3_lookup_array(ctx->ir, reg->array.id);
+               unsigned name = arr->base + reg->array.offset;
+               unsigned r = ra_get_node_reg(ctx->g, name);
+               unsigned num = ctx->set->ra_reg_to_gpr[r];
+
+               if (reg->flags & IR3_REG_RELATIV) {
+                       reg->array.offset = num;
+               } else {
+                       reg->num = num;
+                       reg->flags &= ~IR3_REG_SSA;
+               }
+
+               reg->flags &= ~IR3_REG_ARRAY;
+       } else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
+               unsigned name = ra_name(ctx, id);
+               unsigned r = ra_get_node_reg(ctx->g, name);
+               unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
+
+               debug_assert(!(reg->flags & IR3_REG_RELATIV));
+
+               if (is_high(id->defn))
+                       num += FIRST_HIGH_REG;
+
+               reg->num = num;
+               reg->flags &= ~IR3_REG_SSA;
+
+               if (is_half(id->defn))
+                       reg->flags |= IR3_REG_HALF;
+       }
+}
+
+static void
+ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               struct ir3_register *reg;
+
+               if (instr->regs_count == 0)
+                       continue;
+
+               if (writes_gpr(instr)) {
+                       reg_assign(ctx, instr->regs[0], instr);
+                       if (instr->regs[0]->flags & IR3_REG_HALF)
+                               fixup_half_instr_dst(instr);
+               }
+
+               foreach_src_n(reg, n, instr) {
+                       struct ir3_instruction *src = reg->instr;
+                       /* Note: reg->instr could be null for IR3_REG_ARRAY */
+                       if (!(src || (reg->flags & IR3_REG_ARRAY)))
+                               continue;
+                       reg_assign(ctx, instr->regs[n+1], src);
+                       if (instr->regs[n+1]->flags & IR3_REG_HALF)
+                               fixup_half_instr_src(instr);
+               }
+       }
+}
+
+static int
+ra_alloc(struct ir3_ra_ctx *ctx)
+{
+       /* pre-assign array elements:
+        */
+       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+               unsigned base = 0;
+
+               if (arr->end_ip == 0)
+                       continue;
+
+               /* figure out what else we conflict with which has already
+                * been assigned:
+                */
+retry:
+               list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
+                       if (arr2 == arr)
+                               break;
+                       if (arr2->end_ip == 0)
+                               continue;
+                       /* if it intersects with liverange AND register range.. */
+                       if (intersects(arr->start_ip, arr->end_ip,
+                                       arr2->start_ip, arr2->end_ip) &&
+                               intersects(base, base + arr->length,
+                                       arr2->reg, arr2->reg + arr2->length)) {
+                               base = MAX2(base, arr2->reg + arr2->length);
+                               goto retry;
+                       }
+               }
+
+               arr->reg = base;
+
+               for (unsigned i = 0; i < arr->length; i++) {
+                       unsigned name, reg;
+
+                       name = arr->base + i;
+                       reg = ctx->set->gpr_to_ra_reg[0][base++];
+
+                       ra_set_node_reg(ctx->g, name, reg);
+               }
+       }
+
+       if (!ra_allocate(ctx->g))
+               return -1;
+
+       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+               ra_block_alloc(ctx, block);
+       }
+
+       return 0;
+}
+
+int ir3_ra(struct ir3 *ir, gl_shader_stage type,
+               bool frag_coord, bool frag_face)
+{
+       struct ir3_ra_ctx ctx = {
+                       .ir = ir,
+                       .type = type,
+                       .frag_face = frag_face,
+                       .set = ir->compiler->set,
+       };
+       int ret;
+
+       ra_init(&ctx);
+       ra_add_interference(&ctx);
+       ret = ra_alloc(&ctx);
+       ra_destroy(&ctx);
+
+       return ret;
+}
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
new file mode 100644 (file)
index 0000000..6552980
--- /dev/null
@@ -0,0 +1,818 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+
+#include "util/u_math.h"
+
+#include "ir3.h"
+
+/*
+ * Instruction Scheduling:
+ *
+ * A recursive depth-based scheduling algo.  Recursively find an eligible
+ * instruction to schedule from the deepest instruction (recursing through
+ * its unscheduled src instructions).  Normally this would result in a
+ * lot of re-traversal of the same instructions, so we cache results in
+ * instr->data (and clear cached results that would be no longer valid
+ * after scheduling an instruction).
+ *
+ * There are a few special cases that need to be handled, since sched
+ * is currently independent of register allocation.  Usages of address
+ * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
+ * if you have two pairs of instructions that write the same special
+ * register and then read it, then those pairs cannot be interleaved.
+ * To solve this, when we are in such a scheduling "critical section",
+ * and we encounter a conflicting write to a special register, we try
+ * to schedule any remaining instructions that use that value first.
+ */
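+
+/* For example (illustrative mnemonics), these two a0.x write/read pairs
+ * share the single address register, so they must not be interleaved:
+ *
+ *    mova a0.x, r0.x
+ *    mov.f32f32 r1.x, r<a0.x + 4>   ; consumes the first a0.x value
+ *    mova a0.x, r2.x                ; must come after the mov above
+ *    mov.f32f32 r3.x, r<a0.x + 4>
+ */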
+
+struct ir3_sched_ctx {
+       struct ir3_block *block;           /* the current block */
+       struct list_head depth_list;       /* depth sorted unscheduled instrs */
+       struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
+       struct ir3_instruction *addr;      /* current a0.x user, if any */
+       struct ir3_instruction *pred;      /* current p0.x user, if any */
+       bool error;
+};
+
+static bool is_sfu_or_mem(struct ir3_instruction *instr)
+{
+       return is_sfu(instr) || is_mem(instr);
+}
+
+#define NULL_INSTR ((void *)~0)
+
+static void
+clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+       list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
+               if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
+                       instr2->data = NULL;
+       }
+}
+
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+       debug_assert(ctx->block == instr->block);
+
+       /* maybe there is a better way to handle this than just stuffing
+        * a nop.. ideally we'd know about this constraint in the
+        * scheduling and depth calculation..
+        */
+       if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
+               ir3_NOP(ctx->block);
+
+       /* remove from depth list:
+        */
+       list_delinit(&instr->node);
+
+       if (writes_addr(instr)) {
+               debug_assert(ctx->addr == NULL);
+               ctx->addr = instr;
+       }
+
+       if (writes_pred(instr)) {
+               debug_assert(ctx->pred == NULL);
+               ctx->pred = instr;
+       }
+
+       instr->flags |= IR3_INSTR_MARK;
+
+       list_addtail(&instr->node, &instr->block->instr_list);
+       ctx->scheduled = instr;
+
+       if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+               clear_cache(ctx, NULL);
+       } else {
+               /* invalidate only the necessary entries.. */
+               clear_cache(ctx, instr);
+       }
+}
+
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+       struct ir3_instruction *d = NULL;
+       unsigned i = 0, id = 0;
+
+       while ((i < nsrcs) && !(d = srcs[id = i]))
+               i++;
+
+       if (!d)
+               return NULL;
+
+       for (; i < nsrcs; i++)
+               if (srcs[i] && (srcs[i]->depth > d->depth))
+                       d = srcs[id = i];
+
+       srcs[id] = NULL;
+
+       return d;
+}
+
+/**
+ * @block: the block to search in, starting from end; in first pass,
+ *    this will be the block the instruction would be inserted into
+ *    (but has not yet, ie. it only contains already scheduled
+ *    instructions).  For inter-block scheduling (second pass), this
+ *    would be one of the predecessor blocks.
+ * @instr: the instruction to search for
+ * @maxd:  max distance, bail after searching this # of instruction
+ *    slots, since it means the instruction we are looking for is
+ *    far enough away
+ * @pred:  if true, recursively search into predecessor blocks to
+ *    find the worst case (shortest) distance (only possible after
+ *    individual blocks are all scheduled)
+ */
+static unsigned
+distance(struct ir3_block *block, struct ir3_instruction *instr,
+               unsigned maxd, bool pred)
+{
+       unsigned d = 0;
+
+       list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
+               if ((n == instr) || (d >= maxd))
+                       return d;
+               /* NOTE: don't count branch/jump since we don't know yet if they will
+                * be eliminated later in resolve_jumps().. really should do that
+                * earlier so we don't have this constraint.
+                */
+               if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
+                       d++;
+       }
+
+       /* if coming from a predecessor block, assume it is assigned far
+        * enough away.. we'll fix up later.
+        */
+       if (!pred)
+               return maxd;
+
+       if (pred && (block->data != block)) {
+               /* Search into predecessor blocks, finding the one with the
+                * shortest distance, since that will be the worst case
+                */
+               unsigned min = maxd - d;
+
+               /* (ab)use block->data to prevent recursion: */
+               block->data = block;
+
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       unsigned n;
+
+                       n = distance(block->predecessors[i], instr, min, pred);
+
+                       min = MIN2(min, n);
+               }
+
+               block->data = NULL;
+               d += min;
+       }
+
+       return d;
+}
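+
+/* Worked example (hypothetical, with pred=false): if 'instr' sits
+ * three alu slots back from the end of 'block', distance(block, instr,
+ * 6, false) returns 3; if it is not found within maxd slots we just
+ * return maxd, since the producer is far enough away to satisfy any
+ * delay.
+ */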
+
+/* calculate delay for specified src: */
+static unsigned
+delay_calc_srcn(struct ir3_block *block,
+               struct ir3_instruction *assigner,
+               struct ir3_instruction *consumer,
+               unsigned srcn, bool soft, bool pred)
+{
+       unsigned delay = 0;
+
+       if (is_meta(assigner)) {
+               struct ir3_instruction *src;
+               foreach_ssa_src(src, assigner) {
+                       unsigned d;
+                       d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
+                       delay = MAX2(delay, d);
+               }
+       } else {
+               if (soft) {
+                       if (is_sfu(assigner)) {
+                               delay = 4;
+                       } else {
+                               delay = ir3_delayslots(assigner, consumer, srcn);
+                       }
+               } else {
+                       delay = ir3_delayslots(assigner, consumer, srcn);
+               }
+               delay -= distance(block, assigner, delay, pred);
+       }
+
+       return delay;
+}
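+
+/* Note: the soft case above is a heuristic -- an sfu result only needs
+ * an (ss) sync rather than fixed delay slots, so the value of 4 is
+ * (presumably) just an estimated cost used to rank candidates, not a
+ * legality constraint.
+ */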
+
+/* calculate delay for instruction (maximum of delay for all srcs): */
+static unsigned
+delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
+               bool soft, bool pred)
+{
+       unsigned delay = 0;
+       struct ir3_instruction *src;
+
+       foreach_ssa_src_n(src, i, instr) {
+               unsigned d;
+               d = delay_calc_srcn(block, src, instr, i, soft, pred);
+               delay = MAX2(delay, d);
+       }
+
+       return delay;
+}
+
+struct ir3_sched_notes {
+       /* there is at least one kill which could be scheduled, except
+        * for unscheduled bary.f's:
+        */
+       bool blocked_kill;
+       /* there is at least one instruction that could be scheduled,
+        * except for conflicting address/predicate register usage:
+        */
+       bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+       return !!(instr->flags & IR3_INSTR_MARK);
+}
+
+/* could an instruction be scheduled if specified ssa src was scheduled? */
+static bool
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+       struct ir3_instruction *other_src;
+       foreach_ssa_src(other_src, instr) {
+               /* if dependency not scheduled, we aren't ready yet: */
+               if ((src != other_src) && !is_scheduled(other_src)) {
+                       return false;
+               }
+       }
+       return true;
+}
+
+/* Check if instruction is ok to schedule.  Make sure it is not blocked
+ * by use of addr/predicate register, etc.
+ */
+static bool
+check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+               struct ir3_instruction *instr)
+{
+       /* For instructions that write the address register we need to
+        * make sure there is at least one instruction that uses the
+        * addr value which is otherwise ready.
+        *
+        * TODO if any instructions use pred register and have other
+        * src args, we would need to do the same for writes_pred()..
+        */
+       if (writes_addr(instr)) {
+               struct ir3 *ir = instr->block->shader;
+               bool ready = false;
+               for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+                       struct ir3_instruction *indirect = ir->indirects[i];
+                       if (!indirect)
+                               continue;
+                       if (indirect->address != instr)
+                               continue;
+                       ready = could_sched(indirect, instr);
+               }
+
+               /* nothing could be scheduled, so keep looking: */
+               if (!ready)
+                       return false;
+       }
+
+       /* if this is a write to address/predicate register, and that
+        * register is currently in use, we need to defer until it is
+        * free:
+        */
+       if (writes_addr(instr) && ctx->addr) {
+               debug_assert(ctx->addr != instr);
+               notes->addr_conflict = true;
+               return false;
+       }
+
+       if (writes_pred(instr) && ctx->pred) {
+               debug_assert(ctx->pred != instr);
+               notes->pred_conflict = true;
+               return false;
+       }
+
+       /* if the instruction is a kill, we need to ensure *every*
+        * bary.f is scheduled.  The hw seems unhappy if the thread
+        * gets killed before the end-input (ei) flag is hit.
+        *
+        * We could do this by adding each bary.f instruction as
+        * virtual ssa src for the kill instruction.  But we have
+        * fixed length instr->regs[].
+        *
+        * TODO this wouldn't be quite right if we had multiple
+        * basic blocks and any block was conditional.  We'd need
+        * to schedule the bary.f's outside of any block which
+        * was conditional that contained a kill.. I think..
+        */
+       if (is_kill(instr)) {
+               struct ir3 *ir = instr->block->shader;
+
+               for (unsigned i = 0; i < ir->baryfs_count; i++) {
+                       struct ir3_instruction *baryf = ir->baryfs[i];
+                       if (baryf->flags & IR3_INSTR_UNUSED)
+                               continue;
+                       if (!is_scheduled(baryf)) {
+                               notes->blocked_kill = true;
+                               return false;
+                       }
+               }
+       }
+
+       return true;
+}
+
+/* Find the best instruction to schedule from the specified instruction
+ * or, recursively, its ssa sources.
+ */
+static struct ir3_instruction *
+find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+               struct ir3_instruction *instr)
+{
+       struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
+       struct ir3_instruction *src;
+       unsigned nsrcs = 0;
+
+       if (is_scheduled(instr))
+               return NULL;
+
+       /* use instr->data to cache the results of recursing up the
+        * instr src's.  Otherwise the recursive algo can scale quite
+        * badly w/ shader size.  But this takes some care to clear
+        * the cache appropriately when instructions are scheduled.
+        */
+       if (instr->data) {
+               if (instr->data == NULL_INSTR)
+                       return NULL;
+               return instr->data;
+       }
+
+       /* find unscheduled srcs: */
+       foreach_ssa_src(src, instr) {
+               if (!is_scheduled(src)) {
+                       debug_assert(nsrcs < ARRAY_SIZE(srcs));
+                       srcs[nsrcs++] = src;
+               }
+       }
+
+       /* if all our src's are already scheduled: */
+       if (nsrcs == 0) {
+               if (check_instr(ctx, notes, instr)) {
+                       instr->data = instr;
+                       return instr;
+               }
+               return NULL;
+       }
+
+       while ((src = deepest(srcs, nsrcs))) {
+               struct ir3_instruction *candidate;
+
+               candidate = find_instr_recursive(ctx, notes, src);
+               if (!candidate)
+                       continue;
+
+               if (check_instr(ctx, notes, candidate)) {
+                       instr->data = candidate;
+                       return candidate;
+               }
+       }
+
+       instr->data = NULL_INSTR;
+       return NULL;
+}
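+
+/* NULL_INSTR (defined earlier in this file) is a non-NULL sentinel
+ * stored in instr->data to distinguish "recursed and found nothing"
+ * from "not yet visited" (plain NULL); clear_cache() resets it when
+ * scheduling invalidates the cached result.
+ */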
+
+/* find instruction to schedule: */
+static struct ir3_instruction *
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+               bool soft)
+{
+       struct ir3_instruction *best_instr = NULL;
+       unsigned min_delay = ~0;
+
+       /* TODO we'd really rather use the list/array of block outputs.  But we
+        * don't have such a thing.  Recursing *every* instruction in the list
+        * will result in a lot of repeated traversal, since instructions will
+        * get traversed both when they appear as ssa src to a later instruction
+        * as well as where they appear in the depth_list.
+        */
+       list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+               struct ir3_instruction *candidate;
+               unsigned delay;
+
+               candidate = find_instr_recursive(ctx, notes, instr);
+               if (!candidate)
+                       continue;
+
+               delay = delay_calc(ctx->block, candidate, soft, false);
+               if (delay < min_delay) {
+                       best_instr = candidate;
+                       min_delay = delay;
+               }
+
+               if (min_delay == 0)
+                       break;
+       }
+
+       return best_instr;
+}
+
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
+ */
+static struct ir3_instruction *
+split_addr(struct ir3_sched_ctx *ctx)
+{
+       struct ir3 *ir;
+       struct ir3_instruction *new_addr = NULL;
+       unsigned i;
+
+       debug_assert(ctx->addr);
+
+       ir = ctx->addr->block->shader;
+
+       for (i = 0; i < ir->indirects_count; i++) {
+               struct ir3_instruction *indirect = ir->indirects[i];
+
+               if (!indirect)
+                       continue;
+
+               /* skip instructions already scheduled: */
+               if (is_scheduled(indirect))
+                       continue;
+
+               /* remap remaining instructions using current addr
+                * to new addr:
+                */
+               if (indirect->address == ctx->addr) {
+                       if (!new_addr) {
+                               new_addr = ir3_instr_clone(ctx->addr);
+                               /* original addr is scheduled, but new one isn't: */
+                               new_addr->flags &= ~IR3_INSTR_MARK;
+                       }
+                       ir3_instr_set_address(indirect, new_addr);
+               }
+       }
+
+       /* all remaining indirects remapped to new addr: */
+       ctx->addr = NULL;
+
+       return new_addr;
+}
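+
+/* Illustrative sequence (hypothetical): A writes a0.x; unscheduled B
+ * and C read it indirectly while D also wants to write a0.x.  After
+ * split_addr(), B and C reference A' (an unscheduled clone of A),
+ * ctx->addr is NULL so D can be scheduled, and A' gets scheduled
+ * later to re-materialize the value for B and C.
+ */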
+
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the predicate reg.
+ */
+static struct ir3_instruction *
+split_pred(struct ir3_sched_ctx *ctx)
+{
+       struct ir3 *ir;
+       struct ir3_instruction *new_pred = NULL;
+       unsigned i;
+
+       debug_assert(ctx->pred);
+
+       ir = ctx->pred->block->shader;
+
+       for (i = 0; i < ir->predicates_count; i++) {
+               struct ir3_instruction *predicated = ir->predicates[i];
+
+               /* skip instructions already scheduled: */
+               if (is_scheduled(predicated))
+                       continue;
+
+               /* remap remaining instructions using current pred
+                * to new pred:
+                *
+                * TODO is there ever a case when pred isn't first
+                * (and only) src?
+                */
+               if (ssa(predicated->regs[1]) == ctx->pred) {
+                       if (!new_pred) {
+                               new_pred = ir3_instr_clone(ctx->pred);
+                               /* original pred is scheduled, but new one isn't: */
+                               new_pred->flags &= ~IR3_INSTR_MARK;
+                       }
+                       predicated->regs[1]->instr = new_pred;
+               }
+       }
+
+       /* all remaining predicated remapped to new pred: */
+       ctx->pred = NULL;
+
+       return new_pred;
+}
+
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+       struct list_head unscheduled_list;
+
+       ctx->block = block;
+
+       /* addr/pred writes are per-block: */
+       ctx->addr = NULL;
+       ctx->pred = NULL;
+
+       /* move all instructions to the unscheduled list, and
+        * empty the block's instruction list (to which we will
+        * be inserting).
+        */
+       list_replace(&block->instr_list, &unscheduled_list);
+       list_inithead(&block->instr_list);
+       list_inithead(&ctx->depth_list);
+
+       /* first a pre-pass to schedule all meta:input instructions
+        * (which need to appear first so that RA knows the register is
+        * occupied), and move remaining to depth sorted list:
+        */
+       list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
+               if (instr->opc == OPC_META_INPUT) {
+                       schedule(ctx, instr);
+               } else {
+                       ir3_insert_by_depth(instr, &ctx->depth_list);
+               }
+       }
+
+       while (!list_empty(&ctx->depth_list)) {
+               struct ir3_sched_notes notes = {0};
+               struct ir3_instruction *instr;
+
+               instr = find_eligible_instr(ctx, &notes, true);
+               if (!instr)
+                       instr = find_eligible_instr(ctx, &notes, false);
+
+               if (instr) {
+                       unsigned delay = delay_calc(ctx->block, instr, false, false);
+
+                       /* if even the best candidate still needs delay
+                        * slots filled, it is time for nop's:
+                        */
+                       debug_assert(delay <= 6);
+                       while (delay > 0) {
+                               ir3_NOP(block);
+                               delay--;
+                       }
+
+                       schedule(ctx, instr);
+               } else {
+                       struct ir3_instruction *new_instr = NULL;
+
+                       /* nothing available to schedule.. if we are blocked on
+                        * address/predicate register conflict, then break the
+                        * deadlock by cloning the instruction that wrote that
+                        * reg:
+                        */
+                       if (notes.addr_conflict) {
+                               new_instr = split_addr(ctx);
+                       } else if (notes.pred_conflict) {
+                               new_instr = split_pred(ctx);
+                       } else {
+                               debug_assert(0);
+                               ctx->error = true;
+                               return;
+                       }
+
+                       if (new_instr) {
+                               /* clearing current addr/pred can change what is
+                                * available to schedule, so clear cache..
+                                */
+                               clear_cache(ctx, NULL);
+
+                               ir3_insert_by_depth(new_instr, &ctx->depth_list);
+                               /* the original instr that wrote addr/pred may have
+                                * originated from a different block:
+                                */
+                               new_instr->block = block;
+                       }
+               }
+       }
+
+       /* And lastly, insert branch/jump instructions to take us to
+        * the next block.  Later we'll strip back out the branches
+        * that simply jump to next instruction.
+        */
+       if (block->successors[1]) {
+               /* if/else, conditional branches to "then" or "else": */
+               struct ir3_instruction *br;
+               unsigned delay = 6;
+
+               debug_assert(ctx->pred);
+               debug_assert(block->condition);
+
+               delay -= distance(ctx->block, ctx->pred, delay, false);
+
+               while (delay > 0) {
+                       ir3_NOP(block);
+                       delay--;
+               }
+
+               /* create "else" branch first (since "then" block should
+                * frequently/always end up being a fall-thru):
+                */
+               br = ir3_BR(block);
+               br->cat0.inv = true;
+               br->cat0.target = block->successors[1];
+
+               /* NOTE: we have to hard code delay of 6 above, since
+                * we want to insert the nop's before constructing the
+                * branch.  Throw in an assert so we notice if this
+                * ever breaks on future generation:
+                */
+               debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+               br = ir3_BR(block);
+               br->cat0.target = block->successors[0];
+
+       } else if (block->successors[0]) {
+               /* otherwise unconditional jump to next block: */
+               struct ir3_instruction *jmp;
+
+               jmp = ir3_JUMP(block);
+               jmp->cat0.target = block->successors[0];
+       }
+
+       /* NOTE: if we kept track of the predecessors, we could do a better
+        * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
+        * Note that as we eliminate blocks which contain only an unconditional
+        * jump we probably need to propagate (jp) flag..
+        */
+}
+
+/* After scheduling individual blocks, we could still have cases where,
+ * on one (or more) paths into a block, a value produced by a previous
+ * block has too few delay slots to be legal.  We can't deal with this
+ * in the first pass, because of loops (ie. we can't ensure all
+ * predecessor blocks are already scheduled in the first pass).  All we
+ * can really do at this point is stuff in extra nop's until things are
+ * legal.
+ */
+static void
+sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+{
+       unsigned n = 0;
+
+       ctx->block = block;
+
+       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+               unsigned delay = 0;
+
+               for (unsigned i = 0; i < block->predecessors_count; i++) {
+                       unsigned d = delay_calc(block->predecessors[i], instr, false, true);
+                       delay = MAX2(d, delay);
+               }
+
+               while (delay > n) {
+                       struct ir3_instruction *nop = ir3_NOP(block);
+
+                       /* move to before instr: */
+                       list_delinit(&nop->node);
+                       list_addtail(&nop->node, &instr->node);
+
+                       n++;
+               }
+
+               /* we can bail once we hit worst case delay: */
+               if (++n > 6)
+                       break;
+       }
+}
+
+int ir3_sched(struct ir3 *ir)
+{
+       struct ir3_sched_ctx ctx = {0};
+
+       ir3_clear_mark(ir);
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               sched_block(&ctx, block);
+       }
+
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               sched_intra_block(&ctx, block);
+       }
+
+       if (ctx.error)
+               return -1;
+       return 0;
+}
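+
+/* Sketch of expected usage (assuming the ordering used elsewhere in
+ * the ir3 compiler): false dependencies are added first, then the
+ * blocks are scheduled:
+ *
+ *    ir3_sched_add_deps(ir);
+ *    if (ir3_sched(ir) != 0)
+ *            return -1;
+ */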
+
+/* does instruction 'prior' need to be scheduled before 'instr'? */
+static bool
+depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
+{
+       /* TODO for dependencies that are related to a specific object, ie
+        * a specific SSBO/image/array, we could relax this constraint to
+        * make accesses to unrelated objects not depend on each other (at
+        * least as long as not declared coherent)
+        */
+       if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
+                       ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
+               return true;
+       return !!(instr->barrier_class & prior->barrier_conflict);
+}
+
+static void
+add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
+{
+       struct list_head *prev = instr->node.prev;
+       struct list_head *next = instr->node.next;
+
+       /* add dependencies on previous instructions that must be scheduled
+        * prior to the current instruction
+        */
+       while (prev != &block->instr_list) {
+               struct ir3_instruction *pi =
+                       LIST_ENTRY(struct ir3_instruction, prev, node);
+
+               prev = prev->prev;
+
+               if (is_meta(pi))
+                       continue;
+
+               if (instr->barrier_class == pi->barrier_class) {
+                       ir3_instr_add_dep(instr, pi);
+                       break;
+               }
+
+               if (depends_on(instr, pi))
+                       ir3_instr_add_dep(instr, pi);
+       }
+
+       /* add dependencies on this instruction to following instructions
+        * that must be scheduled after the current instruction:
+        */
+       while (next != &block->instr_list) {
+               struct ir3_instruction *ni =
+                       LIST_ENTRY(struct ir3_instruction, next, node);
+
+               next = next->next;
+
+               if (is_meta(ni))
+                       continue;
+
+               if (instr->barrier_class == ni->barrier_class) {
+                       ir3_instr_add_dep(ni, instr);
+                       break;
+               }
+
+               if (depends_on(ni, instr))
+                       ir3_instr_add_dep(ni, instr);
+       }
+}
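+
+/* Note: the 'break' after matching barrier_class appears to rely on
+ * transitivity -- the nearest instruction of the same class already
+ * carries deps on anything earlier it conflicts with, so one edge to
+ * it is enough.
+ */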
+
+/* before scheduling a block, we need to add any necessary false-dependencies
+ * to ensure that:
+ *
+ *  (1) barriers are scheduled in the right order wrt instructions related
+ *      to the barrier
+ *
+ *  (2) reads that come before a write actually get scheduled before the
+ *      write
+ */
+static void
+calculate_deps(struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+               if (instr->barrier_class) {
+                       add_barrier_deps(block, instr);
+               }
+       }
+}
+
+void
+ir3_sched_add_deps(struct ir3 *ir)
+{
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               calculate_deps(block);
+       }
+}
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
new file mode 100644 (file)
index 0000000..8b18e95
--- /dev/null
@@ -0,0 +1,436 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/u_string.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+
+#include "drm/freedreno_drmif.h"
+
+#include "ir3_shader.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
+
+int
+ir3_glsl_type_size(const struct glsl_type *type)
+{
+       return glsl_count_attribute_slots(type, false);
+}
+
+static void
+delete_variant(struct ir3_shader_variant *v)
+{
+       if (v->ir)
+               ir3_destroy(v->ir);
+       if (v->bo)
+               fd_bo_del(v->bo);
+       if (v->immediates)
+               free(v->immediates);
+       free(v);
+}
+
+/* for vertex shaders, the inputs are loaded into registers before the
+ * shader is executed, so max_regs from the shader instructions might
+ * not properly reflect the # of registers actually used, especially
+ * in the case of passthrough varyings.
+ *
+ * Likewise, for fragment shaders, we can have some regs which are
+ * passed input values but never touched by the resulting shader (ie.
+ * as a result of dead code elimination or simply because we don't
+ * know how to turn the reg off).
+ */
+static void
+fixup_regfootprint(struct ir3_shader_variant *v)
+{
+       unsigned i;
+
+       for (i = 0; i < v->inputs_count; i++) {
+               /* skip frag inputs fetch via bary.f since their reg's are
+                * not written by gpu before shader starts (and in fact the
+                * regid's might not even be valid)
+                */
+               if (v->inputs[i].bary)
+                       continue;
+
+               /* ignore high regs that are global to all threads in a warp
+                * (they exist by default) (a5xx+)
+                */
+               if (v->inputs[i].regid >= regid(48,0))
+                       continue;
+
+               if (v->inputs[i].compmask) {
+                       unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
+                       int32_t regid = (v->inputs[i].regid + n) >> 2;
+                       v->info.max_reg = MAX2(v->info.max_reg, regid);
+               }
+       }
+
+       for (i = 0; i < v->outputs_count; i++) {
+               int32_t regid = (v->outputs[i].regid + 3) >> 2;
+               v->info.max_reg = MAX2(v->info.max_reg, regid);
+       }
+}
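+
+/* e.g. (hypothetical) an input at regid 8 (r2.x) with compmask 0x7
+ * (.xyz) ends at r2.z: n = 2, (8 + 2) >> 2 = 2, so max_reg is bumped
+ * to at least 2 (ie. r2 counts as used).
+ */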
+
+/* wrapper for ir3_assemble() which does some info fixup based on
+ * shader state.  Non-static since used by ir3_cmdline too.
+ */
+void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
+{
+       void *bin;
+
+       bin = ir3_assemble(v->ir, &v->info, gpu_id);
+       if (!bin)
+               return NULL;
+
+       if (gpu_id >= 400) {
+               v->instrlen = v->info.sizedwords / (2 * 16);
+       } else {
+               v->instrlen = v->info.sizedwords / (2 * 4);
+       }
+
+       /* NOTE: if relative addressing is used, we set constlen in
+        * the compiler (to worst-case value) since we don't know in
+        * the assembler what the max addr reg value can be:
+        */
+       v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));
+
+       fixup_regfootprint(v);
+
+       return bin;
+}
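+
+/* e.g. a 256-dword binary is 128 instructions (2 dwords each), giving
+ * instrlen 8 on a4xx+ (16-instruction groups) vs 32 on a3xx
+ * (4-instruction groups).
+ */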
+
+static void
+assemble_variant(struct ir3_shader_variant *v)
+{
+       struct ir3_compiler *compiler = v->shader->compiler;
+       uint32_t gpu_id = compiler->gpu_id;
+       uint32_t sz, *bin;
+
+       bin = ir3_shader_assemble(v, gpu_id);
+       if (!bin)
+               return;
+
+       sz = v->info.sizedwords * 4;
+
+       v->bo = fd_bo_new(compiler->dev, sz,
+                       DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
+                       DRM_FREEDRENO_GEM_TYPE_KMEM);
+
+       memcpy(fd_bo_map(v->bo), bin, sz);
+
+       if (ir3_shader_debug & IR3_DBG_DISASM) {
+               struct ir3_shader_key key = v->key;
+               printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+                       v->binning_pass, key.color_two_side, key.half_precision);
+               ir3_shader_disasm(v, bin, stdout);
+       }
+
+       if (shader_debug_enabled(v->shader->type)) {
+               fprintf(stderr, "Native code for unnamed %s shader %s:\n",
+                       _mesa_shader_stage_to_string(v->shader->type),
+                       v->shader->nir->info.name);
+               if (v->shader->type == MESA_SHADER_FRAGMENT)
+                       fprintf(stderr, "SIMD0\n");
+               ir3_shader_disasm(v, bin, stderr);
+       }
+
+       free(bin);
+
+       /* no need to keep the ir around beyond this point: */
+       ir3_destroy(v->ir);
+       v->ir = NULL;
+}
+
+static struct ir3_shader_variant *
+create_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+               bool binning_pass)
+{
+       struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
+       int ret;
+
+       if (!v)
+               return NULL;
+
+       v->id = ++shader->variant_count;
+       v->shader = shader;
+       v->binning_pass = binning_pass;
+       v->key = *key;
+       v->type = shader->type;
+
+       ret = ir3_compile_shader_nir(shader->compiler, v);
+       if (ret) {
+               debug_error("compile failed!");
+               goto fail;
+       }
+
+       assemble_variant(v);
+       if (!v->bo) {
+               debug_error("assemble failed!");
+               goto fail;
+       }
+
+       return v;
+
+fail:
+       delete_variant(v);
+       return NULL;
+}
+
+static inline struct ir3_shader_variant *
+shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+               bool *created)
+{
+       struct ir3_shader_variant *v;
+
+       *created = false;
+
+       for (v = shader->variants; v; v = v->next)
+               if (ir3_shader_key_equal(key, &v->key))
+                       return v;
+
+       /* compile new variant if it doesn't exist already: */
+       v = create_variant(shader, key, false);
+       if (v) {
+               v->next = shader->variants;
+               shader->variants = v;
+               *created = true;
+       }
+
+       return v;
+}
+
+struct ir3_shader_variant *
+ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
+               bool binning_pass, bool *created)
+{
+       struct ir3_shader_variant *v =
+                       shader_variant(shader, key, created);
+
+       if (v && binning_pass) {
+               if (!v->binning)
+                       v->binning = create_variant(shader, key, true);
+               return v->binning;
+       }
+
+       return v;
+}
+
+void
+ir3_shader_destroy(struct ir3_shader *shader)
+{
+       struct ir3_shader_variant *v, *t;
+       for (v = shader->variants; v; ) {
+               t = v;
+               v = v->next;
+               delete_variant(t);
+       }
+       ralloc_free(shader->nir);
+       free(shader);
+}
+
+struct ir3_shader *
+ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir)
+{
+       struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+
+       shader->compiler = compiler;
+       shader->id = ++shader->compiler->shader_count;
+       shader->type = nir->info.stage;
+
+       NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
+                          (nir_lower_io_options)0);
+
+       /* do first pass optimization, ignoring the key: */
+       shader->nir = ir3_optimize_nir(shader, nir, NULL);
+       if (ir3_shader_debug & IR3_DBG_DISASM) {
+               printf("dump nir%d: type=%d", shader->id, shader->type);
+               nir_print_shader(shader->nir, stdout);
+       }
+
+       return shader;
+}
+
+static void dump_reg(FILE *out, const char *name, uint32_t r)
+{
+       if (r != regid(63,0))
+               fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
+}
+
+static void dump_output(FILE *out, struct ir3_shader_variant *so,
+               unsigned slot, const char *name)
+{
+       uint32_t regid;
+       regid = ir3_find_output_regid(so, slot);
+       dump_reg(out, name, regid);
+}
+
+void
+ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
+{
+       struct ir3 *ir = so->ir;
+       struct ir3_register *reg;
+       const char *type = ir3_shader_stage(so->shader);
+       uint8_t regid;
+       unsigned i;
+
+       for (i = 0; i < ir->ninputs; i++) {
+               if (!ir->inputs[i]) {
+                       fprintf(out, "; in%d unused\n", i);
+                       continue;
+               }
+               reg = ir->inputs[i]->regs[0];
+               regid = reg->num;
+               fprintf(out, "@in(%sr%d.%c)\tin%d\n",
+                               (reg->flags & IR3_REG_HALF) ? "h" : "",
+                               (regid >> 2), "xyzw"[regid & 0x3], i);
+       }
+
+       for (i = 0; i < ir->noutputs; i++) {
+               if (!ir->outputs[i]) {
+                       fprintf(out, "; out%d unused\n", i);
+                       continue;
+               }
+               /* kill shows up as a virtual output.. skip it! */
+               if (is_kill(ir->outputs[i]))
+                       continue;
+               reg = ir->outputs[i]->regs[0];
+               regid = reg->num;
+               fprintf(out, "@out(%sr%d.%c)\tout%d\n",
+                               (reg->flags & IR3_REG_HALF) ? "h" : "",
+                               (regid >> 2), "xyzw"[regid & 0x3], i);
+       }
+
+       for (i = 0; i < so->immediates_count; i++) {
+               fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i);
+               fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
+                               so->immediates[i].val[0],
+                               so->immediates[i].val[1],
+                               so->immediates[i].val[2],
+                               so->immediates[i].val[3]);
+       }
+
+       disasm_a3xx(bin, so->info.sizedwords, 0, out);
+
+       switch (so->type) {
+       case MESA_SHADER_VERTEX:
+               fprintf(out, "; %s: outputs:", type);
+               for (i = 0; i < so->outputs_count; i++) {
+                       uint8_t regid = so->outputs[i].regid;
+                       fprintf(out, " r%d.%c (%s)",
+                                       (regid >> 2), "xyzw"[regid & 0x3],
+                                       gl_varying_slot_name(so->outputs[i].slot));
+               }
+               fprintf(out, "\n");
+               fprintf(out, "; %s: inputs:", type);
+               for (i = 0; i < so->inputs_count; i++) {
+                       uint8_t regid = so->inputs[i].regid;
+                       fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)",
+                                       (regid >> 2), "xyzw"[regid & 0x3],
+                                       so->inputs[i].compmask,
+                                       so->inputs[i].inloc,
+                                       so->inputs[i].bary);
+               }
+               fprintf(out, "\n");
+               break;
+       case MESA_SHADER_FRAGMENT:
+               fprintf(out, "; %s: outputs:", type);
+               for (i = 0; i < so->outputs_count; i++) {
+                       uint8_t regid = so->outputs[i].regid;
+                       fprintf(out, " r%d.%c (%s)",
+                                       (regid >> 2), "xyzw"[regid & 0x3],
+                                       gl_frag_result_name(so->outputs[i].slot));
+               }
+               fprintf(out, "\n");
+               fprintf(out, "; %s: inputs:", type);
+               for (i = 0; i < so->inputs_count; i++) {
+                       uint8_t regid = so->inputs[i].regid;
+                       fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)",
+                                       (regid >> 2), "xyzw"[regid & 0x3],
+                                       gl_varying_slot_name(so->inputs[i].slot),
+                                       so->inputs[i].compmask,
+                                       so->inputs[i].inloc,
+                                       so->inputs[i].bary);
+               }
+               fprintf(out, "\n");
+               break;
+       default:
+               /* TODO */
+               break;
+       }
+
+       /* print generic shader info: */
+       fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n",
+                       type, so->shader->id, so->id,
+                       so->info.instrs_count,
+                       so->info.max_half_reg + 1,
+                       so->info.max_reg + 1);
+
+       fprintf(out, "; %d const, %u constlen\n",
+                       so->info.max_const + 1,
+                       so->constlen);
+
+       fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);
+
+       /* print shader type specific info: */
+       switch (so->type) {
+       case MESA_SHADER_VERTEX:
+               dump_output(out, so, VARYING_SLOT_POS, "pos");
+               dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
+               break;
+       case MESA_SHADER_FRAGMENT:
+               dump_reg(out, "pos (bary)",
+                       ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD));
+               dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
+               if (so->color0_mrt) {
+                       dump_output(out, so, FRAG_RESULT_COLOR, "color");
+               } else {
+                       dump_output(out, so, FRAG_RESULT_DATA0, "data0");
+                       dump_output(out, so, FRAG_RESULT_DATA1, "data1");
+                       dump_output(out, so, FRAG_RESULT_DATA2, "data2");
+                       dump_output(out, so, FRAG_RESULT_DATA3, "data3");
+                       dump_output(out, so, FRAG_RESULT_DATA4, "data4");
+                       dump_output(out, so, FRAG_RESULT_DATA5, "data5");
+                       dump_output(out, so, FRAG_RESULT_DATA6, "data6");
+                       dump_output(out, so, FRAG_RESULT_DATA7, "data7");
+               }
+               /* these two are hard-coded since we don't know how to
+                * program them to anything but all 0's...
+                */
+               if (so->frag_coord)
+                       fprintf(out, "; fragcoord: r0.x\n");
+               if (so->frag_face)
+                       fprintf(out, "; fragface: hr0.x\n");
+               break;
+       default:
+               /* TODO */
+               break;
+       }
+
+       fprintf(out, "\n");
+}
+
+uint64_t
+ir3_shader_outputs(const struct ir3_shader *so)
+{
+       return so->nir->info.outputs_written;
+}
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
new file mode 100644 (file)
index 0000000..bc47160
--- /dev/null
@@ -0,0 +1,587 @@
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef IR3_SHADER_H_
+#define IR3_SHADER_H_
+
+#include <stdio.h>
+
+#include "compiler/shader_enums.h"
+#include "compiler/nir/nir.h"
+#include "util/bitscan.h"
+
+#include "ir3.h"
+
+struct glsl_type;
+
+/* driver param indices: */
+enum ir3_driver_param {
+       /* compute shader driver params: */
+       IR3_DP_NUM_WORK_GROUPS_X = 0,
+       IR3_DP_NUM_WORK_GROUPS_Y = 1,
+       IR3_DP_NUM_WORK_GROUPS_Z = 2,
+       IR3_DP_LOCAL_GROUP_SIZE_X = 4,
+       IR3_DP_LOCAL_GROUP_SIZE_Y = 5,
+       IR3_DP_LOCAL_GROUP_SIZE_Z = 6,
+       /* NOTE: gl_NumWorkGroups should be vec4 aligned because
+        * glDispatchComputeIndirect() needs to load these from
+        * the info->indirect buffer.  Keep that in mind when/if
+        * adding any additional CS driver params.
+        */
+       IR3_DP_CS_COUNT   = 8,   /* must be aligned to vec4 */
+
+       /* vertex shader driver params: */
+       IR3_DP_VTXID_BASE = 0,
+       IR3_DP_VTXCNT_MAX = 1,
+       /* user-clip-plane components, up to 8x vec4's: */
+       IR3_DP_UCP0_X     = 4,
+       /* .... */
+       IR3_DP_UCP7_W     = 35,
+       IR3_DP_VS_COUNT   = 36   /* must be aligned to vec4 */
+};
+
+#define IR3_MAX_SHADER_BUFFERS   32
+#define IR3_MAX_SHADER_IMAGES    32
+#define IR3_MAX_SO_BUFFERS        4
+#define IR3_MAX_SO_OUTPUTS       64
+
+/**
+ * For consts needed to pass internal values to shader which may or may not
+ * be required, rather than allocating worst-case const space, we scan the
+ * shader and allocate consts as-needed:
+ *
+ *   + SSBO sizes: only needed if shader has a get_buffer_size intrinsic
+ *     for a given SSBO
+ *
+ *   + Image dimensions: needed to calculate pixel offset, but only for
+ *     images that have an image_store intrinsic
+ */
+struct ir3_driver_const_layout {
+       struct {
+               uint32_t mask;  /* bitmask of SSBOs that have get_buffer_size */
+               uint32_t count; /* number of consts allocated */
+               /* one const allocated per SSBO which has get_buffer_size,
+                * ssbo_sizes.off[ssbo_id] is offset from start of ssbo_sizes
+                * consts:
+                */
+               uint32_t off[IR3_MAX_SHADER_BUFFERS];
+       } ssbo_size;
+
+       struct {
+               uint32_t mask;  /* bitmask of images that have image_store */
+               uint32_t count; /* number of consts allocated */
+               /* three const allocated per image which has image_store:
+                *  + cpp         (bytes per pixel)
+                *  + pitch       (y pitch)
+                *  + array_pitch (z pitch)
+                */
+               uint32_t off[IR3_MAX_SHADER_IMAGES];
+       } image_dims;
+};
+
+/**
+ * A single output for vertex transform feedback.
+ */
+struct ir3_stream_output {
+       unsigned register_index:6;  /**< 0 to 63 (OUT index) */
+       unsigned start_component:2; /**< 0 to 3 */
+       unsigned num_components:3;  /**< 1 to 4 */
+       unsigned output_buffer:3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
+       unsigned dst_offset:16;     /**< offset into the buffer in dwords */
+       unsigned stream:2;          /**< 0 to 3 */
+};
+
+/**
+ * Stream output for vertex transform feedback.
+ */
+struct ir3_stream_output_info {
+       unsigned num_outputs;
+       /** stride for an entire vertex for each buffer in dwords */
+       uint16_t stride[IR3_MAX_SO_BUFFERS];
+
+       /**
+        * Array of stream outputs, in the order they are to be written in.
+        * Selected components are tightly packed into the output buffer.
+        */
+       struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
+};
+
+/* Configuration key used to identify a shader variant.. different
+ * shader variants can be used to implement features not supported
+ * in hw (two sided color), binning-pass vertex shader, etc.
+ */
+struct ir3_shader_key {
+       union {
+               struct {
+                       /*
+                        * Combined Vertex/Fragment shader parameters:
+                        */
+                       unsigned ucp_enables : 8;
+
+                       /* do we need to check {v,f}saturate_{s,t,r}? */
+                       unsigned has_per_samp : 1;
+
+                       /*
+                        * Vertex shader variant parameters:
+                        */
+                       unsigned vclamp_color : 1;
+
+                       /*
+                        * Fragment shader variant parameters:
+                        */
+                       unsigned color_two_side : 1;
+                       unsigned half_precision : 1;
+                       /* used when shader needs to handle flat varyings (a4xx)
+                        * for front/back color inputs to frag shader:
+                        */
+                       unsigned rasterflat : 1;
+                       unsigned fclamp_color : 1;
+               };
+               uint32_t global;
+       };
+
+       /* bitmask of samplers which need coords clamped for vertex
+        * shader:
+        */
+       uint16_t vsaturate_s, vsaturate_t, vsaturate_r;
+
+       /* bitmask of samplers which need coords clamped for frag
+        * shader:
+        */
+       uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
+
+       /* bitmask of ms shifts */
+       uint32_t vsamples, fsamples;
+
+       /* bitmask of samplers which need astc srgb workaround: */
+       uint16_t vastc_srgb, fastc_srgb;
+};
+
+static inline bool
+ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b)
+{
+       /* slow-path if we need to check {v,f}saturate_{s,t,r} */
+       if (a->has_per_samp || b->has_per_samp)
+               return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
+       return a->global == b->global;
+}
+
+/* will the two keys produce different lowering for a fragment shader? */
+static inline bool
+ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+{
+       if (last_key->has_per_samp || key->has_per_samp) {
+               if ((last_key->fsaturate_s != key->fsaturate_s) ||
+                               (last_key->fsaturate_t != key->fsaturate_t) ||
+                               (last_key->fsaturate_r != key->fsaturate_r) ||
+                               (last_key->fsamples != key->fsamples) ||
+                               (last_key->fastc_srgb != key->fastc_srgb))
+                       return true;
+       }
+
+       if (last_key->fclamp_color != key->fclamp_color)
+               return true;
+
+       if (last_key->color_two_side != key->color_two_side)
+               return true;
+
+       if (last_key->half_precision != key->half_precision)
+               return true;
+
+       if (last_key->rasterflat != key->rasterflat)
+               return true;
+
+       if (last_key->ucp_enables != key->ucp_enables)
+               return true;
+
+       return false;
+}
+
+/* will the two keys produce different lowering for a vertex shader? */
+static inline bool
+ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
+{
+       if (last_key->has_per_samp || key->has_per_samp) {
+               if ((last_key->vsaturate_s != key->vsaturate_s) ||
+                               (last_key->vsaturate_t != key->vsaturate_t) ||
+                               (last_key->vsaturate_r != key->vsaturate_r) ||
+                               (last_key->vsamples != key->vsamples) ||
+                               (last_key->vastc_srgb != key->vastc_srgb))
+                       return true;
+       }
+
+       if (last_key->vclamp_color != key->vclamp_color)
+               return true;
+
+       if (last_key->ucp_enables != key->ucp_enables)
+               return true;
+
+       return false;
+}
+
+/* clears shader-key flags which don't apply to the given shader
+ * stage
+ */
+static inline void
+ir3_normalize_key(struct ir3_shader_key *key, gl_shader_stage type)
+{
+       switch (type) {
+       case MESA_SHADER_FRAGMENT:
+               if (key->has_per_samp) {
+                       key->vsaturate_s = 0;
+                       key->vsaturate_t = 0;
+                       key->vsaturate_r = 0;
+                       key->vastc_srgb = 0;
+                       key->vsamples = 0;
+               }
+               break;
+       case MESA_SHADER_VERTEX:
+               key->color_two_side = false;
+               key->half_precision = false;
+               key->rasterflat = false;
+               if (key->has_per_samp) {
+                       key->fsaturate_s = 0;
+                       key->fsaturate_t = 0;
+                       key->fsaturate_r = 0;
+                       key->fastc_srgb = 0;
+                       key->fsamples = 0;
+               }
+               break;
+       default:
+               /* TODO */
+               break;
+       }
+
+}
+
+struct ir3_shader_variant {
+       struct fd_bo *bo;
+
+       /* variant id (for debug) */
+       uint32_t id;
+
+       struct ir3_shader_key key;
+
+       /* vertex shaders can have an extra version for hwbinning pass,
+        * which is pointed to by so->binning:
+        */
+       bool binning_pass;
+       struct ir3_shader_variant *binning;
+
+       struct ir3_driver_const_layout const_layout;
+       struct ir3_info info;
+       struct ir3 *ir;
+
+       /* the instruction length is in units of instruction groups
+        * (4 instructions for a3xx, 16 instructions for a4xx.. each
+        * instruction is 2 dwords):
+        */
+       unsigned instrlen;
+
+       /* the constants length is in units of vec4's, and is the sum of
+        * the uniforms and the built-in compiler constants
+        */
+       unsigned constlen;
+
+       /* number of uniforms (in vec4), not including built-in compiler
+        * constants, etc.
+        */
+       unsigned num_uniforms;
+
+       unsigned num_ubos;
+
+       /* About Linkage:
+        *   + Let the frag shader determine the position/compmask for the
+        *     varyings, since it is the place where we know if the varying
+        *     is actually used, and if so, which components are used.  So
+        *     what the hw calls "outloc" is taken from the "inloc" of the
+        *     frag shader.
+        *   + From the vert shader, we only need the output regid
+        */
+
+       bool frag_coord, frag_face, color0_mrt;
+
+       /* NOTE: for input/outputs, slot is:
+        *   gl_vert_attrib  - for VS inputs
+        *   gl_varying_slot - for VS output / FS input
+        *   gl_frag_result  - for FS output
+        */
+
+       /* varyings/outputs: */
+       unsigned outputs_count;
+       struct {
+               uint8_t slot;
+               uint8_t regid;
+       } outputs[16 + 2];  /* +POSITION +PSIZE */
+       bool writes_pos, writes_psize;
+
+       /* attributes (VS) / varyings (FS):
+        * Note that sysval's should come *after* normal inputs.
+        */
+       unsigned inputs_count;
+       struct {
+               uint8_t slot;
+               uint8_t regid;
+               uint8_t compmask;
+               uint8_t ncomp;
+               /* location of input (ie. offset passed to bary.f, etc).  This
+                * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
+                * have the OUTLOCn value offset by 8, presumably to account
+                * for gl_Position/gl_PointSize)
+                */
+               uint8_t inloc;
+               /* vertex shader specific: */
+               bool    sysval     : 1;   /* slot is a gl_system_value */
+               /* fragment shader specific: */
+               bool    bary       : 1;   /* fetched varying (vs one loaded into reg) */
+               bool    rasterflat : 1;   /* special handling for emit->rasterflat */
+               enum glsl_interp_mode interpolate;
+       } inputs[16 + 2];  /* +POSITION +FACE */
+
+       /* sum of input components (scalar).  For frag shaders, it only counts
+        * the varying inputs:
+        */
+       unsigned total_in;
+
+       /* For frag shaders, the total number of inputs (not scalar,
+        * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
+        */
+       unsigned varying_in;
+
+       /* number of samplers/textures (which are currently 1:1): */
+       int num_samp;
+
+       /* do we have one or more SSBO instructions: */
+       bool has_ssbo;
+
+       /* do we have kill instructions: */
+       bool has_kill;
+
+       /* Layout of constant registers, each section (in vec4). Pointer size
+        * is 32b (a3xx, a4xx), or 64b (a5xx+), which affects the size of the
+        * UBO and stream-out consts.
+        */
+       struct {
+               /* user const start at zero */
+               unsigned ubo;
+               /* NOTE that a3xx might need a section for SSBO addresses too */
+               unsigned ssbo_sizes;
+               unsigned image_dims;
+               unsigned driver_param;
+               unsigned tfbo;
+               unsigned immediate;
+       } constbase;
+
+       unsigned immediates_count;
+       unsigned immediates_size;
+       struct {
+               uint32_t val[4];
+       } *immediates;
+
+       /* for astc srgb workaround, the number/base of additional
+        * alpha tex states we need, and index of original tex states
+        */
+       struct {
+               unsigned base, count;
+               unsigned orig_idx[16];
+       } astc_srgb;
+
+       /* shader variants form a linked list: */
+       struct ir3_shader_variant *next;
+
+       /* replicated here to avoid passing extra ptrs everywhere: */
+       gl_shader_stage type;
+       struct ir3_shader *shader;
+};
+
+struct ir3_shader {
+       gl_shader_stage type;
+
+       /* shader id (for debug): */
+       uint32_t id;
+       uint32_t variant_count;
+
+       /* so we know when we can disable TGSI related hacks: */
+       bool from_tgsi;
+
+       struct ir3_compiler *compiler;
+
+       struct nir_shader *nir;
+       struct ir3_stream_output_info stream_output;
+
+       struct ir3_shader_variant *variants;
+};
+
+void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
+struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader,
+               struct ir3_shader_key *key, bool binning_pass, bool *created);
+struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir);
+void ir3_shader_destroy(struct ir3_shader *shader);
+void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
+uint64_t ir3_shader_outputs(const struct ir3_shader *so);
+
+int
+ir3_glsl_type_size(const struct glsl_type *type);
+
+static inline const char *
+ir3_shader_stage(struct ir3_shader *shader)
+{
+       switch (shader->type) {
+       case MESA_SHADER_VERTEX:     return "VERT";
+       case MESA_SHADER_FRAGMENT:   return "FRAG";
+       case MESA_SHADER_COMPUTE:    return "CL";
+       default:
+               unreachable("invalid type");
+               return NULL;
+       }
+}
+
+/*
+ * Helper/util:
+ */
+
+static inline int
+ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
+{
+       int j;
+
+       for (j = 0; j < so->outputs_count; j++)
+               if (so->outputs[j].slot == slot)
+                       return j;
+
+       /* it seems optional to have an OUT.BCOLOR[n] for each OUT.COLOR[n]
+        * in the vertex shader.. but the fragment shader doesn't know this
+        * so it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
+        * at link time if there is no matching OUT.BCOLOR[n], we must map
+        * OUT.COLOR[n] to IN.BCOLOR[n].  And vice versa if there is only
+        * an OUT.BCOLOR[n] but no matching OUT.COLOR[n]
+        */
+       if (slot == VARYING_SLOT_BFC0) {
+               slot = VARYING_SLOT_COL0;
+       } else if (slot == VARYING_SLOT_BFC1) {
+               slot = VARYING_SLOT_COL1;
+       } else if (slot == VARYING_SLOT_COL0) {
+               slot = VARYING_SLOT_BFC0;
+       } else if (slot == VARYING_SLOT_COL1) {
+               slot = VARYING_SLOT_BFC1;
+       } else {
+               return 0;
+       }
+
+       for (j = 0; j < so->outputs_count; j++)
+               if (so->outputs[j].slot == slot)
+                       return j;
+
+       debug_assert(0);
+
+       return 0;
+}
+
+static inline int
+ir3_next_varying(const struct ir3_shader_variant *so, int i)
+{
+       while (++i < so->inputs_count)
+               if (so->inputs[i].compmask && so->inputs[i].bary)
+                       break;
+       return i;
+}
+
+struct ir3_shader_linkage {
+       uint8_t max_loc;
+       uint8_t cnt;
+       struct {
+               uint8_t regid;
+               uint8_t compmask;
+               uint8_t loc;
+       } var[32];
+};
+
+static inline void
+ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc)
+{
+       int i = l->cnt++;
+
+       debug_assert(i < ARRAY_SIZE(l->var));
+
+       l->var[i].regid    = regid;
+       l->var[i].compmask = compmask;
+       l->var[i].loc      = loc;
+       l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
+}
+
+static inline void
+ir3_link_shaders(struct ir3_shader_linkage *l,
+               const struct ir3_shader_variant *vs,
+               const struct ir3_shader_variant *fs)
+{
+       int j = -1, k;
+
+       while (l->cnt < ARRAY_SIZE(l->var)) {
+               j = ir3_next_varying(fs, j);
+
+               if (j >= fs->inputs_count)
+                       break;
+
+               if (fs->inputs[j].inloc >= fs->total_in)
+                       continue;
+
+               k = ir3_find_output(vs, fs->inputs[j].slot);
+
+               ir3_link_add(l, vs->outputs[k].regid,
+                       fs->inputs[j].compmask, fs->inputs[j].inloc);
+       }
+}
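+
+/* Worked example (values assumed for illustration): with two FS varyings,
+ * compmask 0xf at inloc 0 and inloc 4, written by the VS at r0.x and r1.x,
+ * ir3_link_shaders() would fill in:
+ *
+ *    l->var[0] = { .regid = regid(0, 0), .compmask = 0xf, .loc = 0 }
+ *    l->var[1] = { .regid = regid(1, 0), .compmask = 0xf, .loc = 4 }
+ *    l->max_loc = 8      // 4 + util_last_bit(0xf)
+ */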
+
+static inline uint32_t
+ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
+{
+       int j;
+       for (j = 0; j < so->outputs_count; j++)
+               if (so->outputs[j].slot == slot)
+                       return so->outputs[j].regid;
+       return regid(63, 0);
+}
+
+static inline uint32_t
+ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
+{
+       int j;
+       for (j = 0; j < so->inputs_count; j++)
+               if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
+                       return so->inputs[j].regid;
+       return regid(63, 0);
+}
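+
+/* note: regid(63, 0) (i.e. r63.x) appears to serve as the "not found" /
+ * unallocated sentinel returned by the two helpers above.
+ */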
+
+/* Calculate register footprint in terms of half-regs (i.e. one full
+ * reg counts as two half-regs).
+ */
+static inline uint32_t
+ir3_shader_halfregs(const struct ir3_shader_variant *v)
+{
+       return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
+}
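+
+/* e.g. (illustrative): a variant with max_reg = 3 (r0-r3 used) and
+ * max_half_reg = 1 (hr0-hr1 used) has a footprint of
+ * 2*(3+1) + (1+1) = 10 half-regs.
+ */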
+
+#endif /* IR3_SHADER_H_ */
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
new file mode 100644 (file)
index 0000000..07319df
--- /dev/null
@@ -0,0 +1,64 @@
+# Copyright © 2018 Rob Clark
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+ir3_nir_trig_c = custom_target(
+  'ir3_nir_trig.c',
+  input : 'ir3_nir_trig.py',
+  output : 'ir3_nir_trig.c',
+  command : [
+    prog_python, '@INPUT@',
+    '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
+  ],
+  capture : true,
+  depend_files : nir_algebraic_py,
+)
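+
+# Note: ir3_nir_trig.py is a nir_algebraic-based generator (hence the
+# depend_files entry above); its stdout is captured into the generated
+# ir3_nir_trig.c.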
+
+libfreedreno_ir3_files = files(
+  'disasm-a3xx.c',
+  'instr-a3xx.h',
+  'ir3.c',
+  'ir3_compiler_nir.c',
+  'ir3_compiler.c',
+  'ir3_compiler.h',
+  'ir3_cp.c',
+  'ir3_depth.c',
+  'ir3_group.c',
+  'ir3.h',
+  'ir3_legalize.c',
+  'ir3_nir.c',
+  'ir3_nir.h',
+  'ir3_nir_lower_tg4_to_tex.c',
+  'ir3_print.c',
+  'ir3_ra.c',
+  'ir3_sched.c',
+  'ir3_shader.c',
+  'ir3_shader.h',
+)
+
+libfreedreno_ir3 = static_library(
+  'freedreno_ir3',
+  [libfreedreno_ir3_files, ir3_nir_trig_c],
+  include_directories : [inc_freedreno, inc_common],
+  c_args : [c_vis_args, no_override_init_args],
+  cpp_args : [cpp_vis_args],
+  dependencies : idep_nir_headers,
+  build_by_default : false,
+)
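+
+# A driver then links this library in; a consumer's meson.build might look
+# something like this (illustrative sketch, not part of this change):
+#
+#   libmydriver = static_library(
+#     'mydriver',
+#     files('mydriver.c'),
+#     link_with : libfreedreno_ir3,
+#     include_directories : [inc_freedreno, inc_common],
+#   )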
+
index bb2cb201c0deb5216021668a73237367952b68a7..26ee6213890e9851c4726dd02c4c2579401ad483 100644 (file)
@@ -21,3 +21,4 @@
 inc_freedreno = include_directories('.')
 
 subdir('drm')
+subdir('ir3')
index 9b9b3d39fea1a324f2f636ea02318b9f7b44a3b2..936c286f4c92a50ec8b4b56eba893d98576f8ce8 100644 (file)
@@ -6,6 +6,7 @@ TARGET_LIB_DEPS += \
        $(top_builddir)/src/gallium/winsys/freedreno/drm/libfreedrenodrm.la \
        $(top_builddir)/src/gallium/drivers/freedreno/libfreedreno.la \
        $(top_builddir)/src/freedreno/libfreedreno_drm.la \
+       $(top_builddir)/src/freedreno/libfreedreno_ir3.la \
        $(FREEDRENO_LIBS) \
        $(LIBDRM_LIBS)
 
index 39887e13e379ca3e5b8d3079490d7a684208edad..32130ab94c53b16e0a5e0df06ac344f67c01ac25 100644 (file)
@@ -9,11 +9,6 @@ AM_CFLAGS = \
        -I$(top_srcdir)/src/compiler/nir \
        $(GALLIUM_DRIVER_CFLAGS)
 
-MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
-ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
-       $(MKDIR_GEN)
-       $(AM_V_GEN) $(PYTHON) $(PYTHON_FLAGS) $(srcdir)/ir3/ir3_nir_trig.py -p $(top_srcdir)/src/compiler/nir > $@ || ($(RM) $@; false)
-
 noinst_LTLIBRARIES = libfreedreno.la
 
 libfreedreno_la_SOURCES = \
@@ -23,28 +18,6 @@ libfreedreno_la_SOURCES = \
        $(a4xx_SOURCES) \
        $(a5xx_SOURCES) \
        $(a6xx_SOURCES) \
-       $(ir3_SOURCES) \
-       $(ir3_GENERATED_FILES)
-
-BUILT_SOURCES := $(ir3_GENERATED_FILES)
-CLEANFILES := $(BUILT_SOURCES)
-EXTRA_DIST = ir3/ir3_nir_trig.py
-
-noinst_PROGRAMS = ir3_compiler
-
-# XXX: Required due to the C++ sources in libnir
-nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp
-ir3_compiler_SOURCES = \
-       ir3/ir3_cmdline.c
-
-ir3_compiler_LDADD = \
-       libfreedreno.la \
-       $(top_builddir)/src/gallium/auxiliary/libgallium.la \
-       $(top_builddir)/src/compiler/nir/libnir.la \
-       $(top_builddir)/src/compiler/glsl/libstandalone.la \
-       $(top_builddir)/src/util/libmesautil.la \
-       $(top_builddir)/src/mesa/libmesagallium.la \
-       $(top_builddir)/src/freedreno/libfreedreno_drm.la \
-       $(GALLIUM_COMMON_LIB_DEPS)
+       $(ir3_SOURCES)
 
-EXTRA_DIST += meson.build
+EXTRA_DIST = meson.build
index bde217d80a2b50071c76316935a3905ec2735755..039a8ca7af73b020dc49c18698461f6ea09dce0a 100644 (file)
@@ -195,29 +195,8 @@ a6xx_SOURCES := \
        a6xx/fd6_zsa.h
 
 ir3_SOURCES := \
-       ir3/disasm-a3xx.c \
-       ir3/instr-a3xx.h \
-       ir3/ir3.c \
        ir3/ir3_cache.c \
        ir3/ir3_cache.h \
-       ir3/ir3_compiler_nir.c \
-       ir3/ir3_compiler.c \
-       ir3/ir3_compiler.h \
-       ir3/ir3_cp.c \
-       ir3/ir3_depth.c \
        ir3/ir3_gallium.c \
-       ir3/ir3_gallium.h \
-       ir3/ir3_group.c \
-       ir3/ir3.h \
-       ir3/ir3_legalize.c \
-       ir3/ir3_nir.c \
-       ir3/ir3_nir.h \
-       ir3/ir3_nir_lower_tg4_to_tex.c \
-       ir3/ir3_print.c \
-       ir3/ir3_ra.c \
-       ir3/ir3_sched.c \
-       ir3/ir3_shader.c \
-       ir3/ir3_shader.h
+       ir3/ir3_gallium.h
 
-ir3_GENERATED_FILES := \
-       ir3/ir3_nir_trig.c
index 4596aeee0255a949f5763b49c854c003a38f4ecb..0c9412a7501f25085b0f3d52744b4912e2617c27 100644 (file)
@@ -31,7 +31,7 @@
 
 #include "freedreno_context.h"
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 
 struct fd3_context {
index 0551f1f8b914e4e7aa84a3455024849e5375e67d..533838a9a6d5e5d91d8f1fcd4902d4a871f38e94 100644 (file)
@@ -29,7 +29,8 @@
 
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
-#include "ir3_shader.h"
+
+#include "ir3/ir3_shader.h"
 
 struct fd3_emit;
 
index a010a4df9a1161c991a0da833f7e00526bea2290..7ed57d2de5a852c9ed81560df77f580c90bdb30e 100644 (file)
@@ -30,7 +30,8 @@
 #include "fd3_screen.h"
 #include "fd3_context.h"
 #include "fd3_format.h"
-#include "ir3_compiler.h"
+
+#include "ir3/ir3_compiler.h"
 
 static boolean
 fd3_screen_is_format_supported(struct pipe_screen *pscreen,
index a4b84d400ef56c72a154276fa0e6f05471d1ed7c..a84e3a90f8309f783b56f5f5d8eb2ce2bc0cf0ea 100644 (file)
@@ -31,7 +31,7 @@
 
 #include "freedreno_context.h"
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 struct fd4_context {
        struct fd_context base;
index cc98bc9a4d6bdfe983f59202950184c921349ea3..a0a0bec264feb12e73ebc9e949d2de3fba7d8d00 100644 (file)
@@ -29,7 +29,8 @@
 
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
-#include "ir3_shader.h"
+
+#include "ir3/ir3_shader.h"
 
 struct fd4_emit;
 
index 4e4e274cd10f2851749e4bcf4c8d218b2bb1c9bd..961e907b77902ba28bc47c71776fa005f7a01bb8 100644 (file)
@@ -30,7 +30,8 @@
 #include "fd4_screen.h"
 #include "fd4_context.h"
 #include "fd4_format.h"
-#include "ir3_compiler.h"
+
+#include "ir3/ir3_compiler.h"
 
 static boolean
 fd4_screen_is_format_supported(struct pipe_screen *pscreen,
index 0cd252167b7fe3682c78c955f6669db65a992e3a..324878b4348626775c40723dc4a98f591d9f2258 100644 (file)
@@ -31,7 +31,7 @@
 
 #include "freedreno_context.h"
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 struct fd5_context {
        struct fd_context base;
index 72cbf9a8b88defc15b7aad4de18026acc782c357..cdb31c62b634eee7d6844e3d8324d39e97d028a0 100644 (file)
@@ -29,7 +29,8 @@
 
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
-#include "ir3_shader.h"
+
+#include "ir3/ir3_shader.h"
 
 struct fd5_emit;
 
index 7d8d2b3e5b8dbb683f90bfd9f74260eb98648b6b..db9617908799486aa6828c7f71ab4c4f88323c71 100644 (file)
@@ -33,7 +33,7 @@
 #include "fd5_format.h"
 #include "fd5_resource.h"
 
-#include "ir3_compiler.h"
+#include "ir3/ir3_compiler.h"
 
 static bool
 valid_sample_count(unsigned sample_count)
index f3cdd44dec433f0b7b29a706cee463025c4f11e7..2493813fe1aceb04995aa68ec7e00201ed4bece8 100644 (file)
@@ -32,7 +32,7 @@
 
 #include "freedreno_context.h"
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 #include "a6xx.xml.h"
 
index 83c4688a243dded1a5a2239730fb552a9ea7be6b..3ed5426b50ee4424b60c67713d0567303a9478f1 100644 (file)
@@ -30,7 +30,8 @@
 
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
-#include "ir3_shader.h"
+
+#include "ir3/ir3_shader.h"
 #include "ir3_cache.h"
 
 struct fd6_streamout_state {
index 9e039bf87a91a11ddf31558e630b0afcdda956ae..a191ea696ba1035016444a2067d46a0a5a99fefd 100644 (file)
@@ -33,7 +33,7 @@
 #include "fd6_format.h"
 #include "fd6_resource.h"
 
-#include "ir3_compiler.h"
+#include "ir3/ir3_compiler.h"
 
 static boolean
 fd6_screen_is_format_supported(struct pipe_screen *pscreen,
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
deleted file mode 100644 (file)
index 4cf45ce..0000000
+++ /dev/null
@@ -1,1038 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-#include <assert.h>
-
-#include <util/u_debug.h>
-
-#include "instr-a3xx.h"
-
-/* bitmask of debug flags */
-enum debug_t {
-       PRINT_RAW      = 0x1,    /* dump raw hexdump */
-       PRINT_VERBOSE  = 0x2,
-};
-
-static enum debug_t debug;
-
-#define printf debug_printf
-
-static const char *levels[] = {
-               "",
-               "\t",
-               "\t\t",
-               "\t\t\t",
-               "\t\t\t\t",
-               "\t\t\t\t\t",
-               "\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t\t",
-               "\t\t\t\t\t\t\t\t\t",
-               "x",
-               "x",
-               "x",
-               "x",
-               "x",
-               "x",
-};
-
-static const char *component = "xyzw";
-
-static const char *type[] = {
-               [TYPE_F16] = "f16",
-               [TYPE_F32] = "f32",
-               [TYPE_U16] = "u16",
-               [TYPE_U32] = "u32",
-               [TYPE_S16] = "s16",
-               [TYPE_S32] = "s32",
-               [TYPE_U8]  = "u8",
-               [TYPE_S8]  = "s8",
-};
-
-struct disasm_ctx {
-       FILE *out;
-       int level;
-
-       /* current instruction repeat flag: */
-       unsigned repeat;
-};
-
-static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
-               bool c, bool im, bool neg, bool abs, bool addr_rel)
-{
-       const char type = c ? 'c' : 'r';
-
-       // XXX I prefer - and || for neg/abs, but preserving format used
-       // by libllvm-a3xx for easy diffing..
-
-       if (abs && neg)
-               fprintf(ctx->out, "(absneg)");
-       else if (neg)
-               fprintf(ctx->out, "(neg)");
-       else if (abs)
-               fprintf(ctx->out, "(abs)");
-
-       if (r)
-               fprintf(ctx->out, "(r)");
-
-       if (im) {
-               fprintf(ctx->out, "%d", reg.iim_val);
-       } else if (addr_rel) {
-               /* I would just use %+d but trying to make it diff'able with
-                * libllvm-a3xx...
-                */
-               if (reg.iim_val < 0)
-                       fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
-               else if (reg.iim_val > 0)
-                       fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
-               else
-                       fprintf(ctx->out, "%s%c<a0.x>", full ? "" : "h", type);
-       } else if ((reg.num == REG_A0) && !c) {
-               fprintf(ctx->out, "a0.%c", component[reg.comp]);
-       } else if ((reg.num == REG_P0) && !c) {
-               fprintf(ctx->out, "p0.%c", component[reg.comp]);
-       } else {
-               fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]);
-       }
-}
-
-
-static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
-{
-       print_reg(ctx, reg, full, false, false, false, false, false, addr_rel);
-}
-
-static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
-               bool c, bool im, bool neg, bool abs, bool addr_rel)
-{
-       print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel);
-}
-
-/* TODO switch to using reginfo struct everywhere, since more readable
- * than passing a bunch of bools to print_reg_src
- */
-
-struct reginfo {
-       reg_t reg;
-       bool full;
-       bool r;
-       bool c;
-       bool im;
-       bool neg;
-       bool abs;
-       bool addr_rel;
-};
-
-static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
-{
-       print_reg_src(ctx, info->reg, info->full, info->r, info->c, info->im,
-                       info->neg, info->abs, info->addr_rel);
-}
-
-//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
-//{
-//     print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
-//}
-
-static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)
-{
-       instr_cat0_t *cat0 = &instr->cat0;
-
-       switch (cat0->opc) {
-       case OPC_KILL:
-               fprintf(ctx->out, " %sp0.%c", cat0->inv ? "!" : "",
-                               component[cat0->comp]);
-               break;
-       case OPC_BR:
-               fprintf(ctx->out, " %sp0.%c, #%d", cat0->inv ? "!" : "",
-                               component[cat0->comp], cat0->a3xx.immed);
-               break;
-       case OPC_JUMP:
-       case OPC_CALL:
-               fprintf(ctx->out, " #%d", cat0->a3xx.immed);
-               break;
-       }
-
-       if ((debug & PRINT_VERBOSE) && (cat0->dummy2|cat0->dummy3|cat0->dummy4))
-               fprintf(ctx->out, "\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4);
-}
-
-static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)
-{
-       instr_cat1_t *cat1 = &instr->cat1;
-
-       if (cat1->ul)
-               fprintf(ctx->out, "(ul)");
-
-       if (cat1->src_type == cat1->dst_type) {
-               if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
-                       /* special case (nmemonic?): */
-                       fprintf(ctx->out, "mova");
-               } else {
-                       fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
-               }
-       } else {
-               fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
-       }
-
-       fprintf(ctx->out, " ");
-
-       if (cat1->even)
-               fprintf(ctx->out, "(even)");
-
-       if (cat1->pos_inf)
-               fprintf(ctx->out, "(pos_infinity)");
-
-       print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
-                       cat1->dst_rel);
-
-       fprintf(ctx->out, ", ");
-
-       /* ugg, have to special case this.. vs print_reg().. */
-       if (cat1->src_im) {
-               if (type_float(cat1->src_type))
-                       fprintf(ctx->out, "(%f)", cat1->fim_val);
-               else if (type_uint(cat1->src_type))
-                       fprintf(ctx->out, "0x%08x", cat1->uim_val);
-               else
-                       fprintf(ctx->out, "%d", cat1->iim_val);
-       } else if (cat1->src_rel && !cat1->src_c) {
-               /* I would just use %+d but trying to make it diff'able with
-                * libllvm-a3xx...
-                */
-               char type = cat1->src_rel_c ? 'c' : 'r';
-               if (cat1->off < 0)
-                       fprintf(ctx->out, "%c<a0.x - %d>", type, -cat1->off);
-               else if (cat1->off > 0)
-                       fprintf(ctx->out, "%c<a0.x + %d>", type, cat1->off);
-               else
-                       fprintf(ctx->out, "%c<a0.x>", type);
-       } else {
-               print_reg_src(ctx, (reg_t)(cat1->src), type_size(cat1->src_type) == 32,
-                               cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
-       }
-
-       if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
-               fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0);
-}
-
-static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)
-{
-       instr_cat2_t *cat2 = &instr->cat2;
-       static const char *cond[] = {
-                       "lt",
-                       "le",
-                       "gt",
-                       "ge",
-                       "eq",
-                       "ne",
-                       "?6?",
-       };
-
-       switch (_OPC(2, cat2->opc)) {
-       case OPC_CMPS_F:
-       case OPC_CMPS_U:
-       case OPC_CMPS_S:
-       case OPC_CMPV_F:
-       case OPC_CMPV_U:
-       case OPC_CMPV_S:
-               fprintf(ctx->out, ".%s", cond[cat2->cond]);
-               break;
-       }
-
-       fprintf(ctx->out, " ");
-       if (cat2->ei)
-               fprintf(ctx->out, "(ei)");
-       print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
-       fprintf(ctx->out, ", ");
-
-       if (cat2->c1.src1_c) {
-               print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
-                               cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
-                               cat2->src1_abs, false);
-       } else if (cat2->rel1.src1_rel) {
-               print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
-                               cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
-                               cat2->src1_abs, cat2->rel1.src1_rel);
-       } else {
-               print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r,
-                               false, cat2->src1_im, cat2->src1_neg,
-                               cat2->src1_abs, false);
-       }
-
-       switch (_OPC(2, cat2->opc)) {
-       case OPC_ABSNEG_F:
-       case OPC_ABSNEG_S:
-       case OPC_CLZ_B:
-       case OPC_CLZ_S:
-       case OPC_SIGN_F:
-       case OPC_FLOOR_F:
-       case OPC_CEIL_F:
-       case OPC_RNDNE_F:
-       case OPC_RNDAZ_F:
-       case OPC_TRUNC_F:
-       case OPC_NOT_B:
-       case OPC_BFREV_B:
-       case OPC_SETRM:
-       case OPC_CBITS_B:
-               /* these only have one src reg */
-               break;
-       default:
-               fprintf(ctx->out, ", ");
-               if (cat2->c2.src2_c) {
-                       print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
-                                       cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
-                                       cat2->src2_abs, false);
-               } else if (cat2->rel2.src2_rel) {
-                       print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
-                                       cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
-                                       cat2->src2_abs, cat2->rel2.src2_rel);
-               } else {
-                       print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r,
-                                       false, cat2->src2_im, cat2->src2_neg,
-                                       cat2->src2_abs, false);
-               }
-               break;
-       }
-}
-
-static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
-{
-       instr_cat3_t *cat3 = &instr->cat3;
-       bool full = instr_cat3_full(cat3);
-
-       fprintf(ctx->out, " ");
-       print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false);
-       fprintf(ctx->out, ", ");
-       if (cat3->c1.src1_c) {
-               print_reg_src(ctx, (reg_t)(cat3->c1.src1), full,
-                               cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg,
-                               false, false);
-       } else if (cat3->rel1.src1_rel) {
-               print_reg_src(ctx, (reg_t)(cat3->rel1.src1), full,
-                               cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg,
-                               false, cat3->rel1.src1_rel);
-       } else {
-               print_reg_src(ctx, (reg_t)(cat3->src1), full,
-                               cat3->src1_r, false, false, cat3->src1_neg,
-                               false, false);
-       }
-       fprintf(ctx->out, ", ");
-       print_reg_src(ctx, (reg_t)cat3->src2, full,
-                       cat3->src2_r, cat3->src2_c, false, cat3->src2_neg,
-                       false, false);
-       fprintf(ctx->out, ", ");
-       if (cat3->c2.src3_c) {
-               print_reg_src(ctx, (reg_t)(cat3->c2.src3), full,
-                               cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg,
-                               false, false);
-       } else if (cat3->rel2.src3_rel) {
-               print_reg_src(ctx, (reg_t)(cat3->rel2.src3), full,
-                               cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg,
-                               false, cat3->rel2.src3_rel);
-       } else {
-               print_reg_src(ctx, (reg_t)(cat3->src3), full,
-                               cat3->src3_r, false, false, cat3->src3_neg,
-                               false, false);
-       }
-}
-
-static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
-{
-       instr_cat4_t *cat4 = &instr->cat4;
-
-       fprintf(ctx->out, " ");
-       print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
-       fprintf(ctx->out, ", ");
-
-       if (cat4->c.src_c) {
-               print_reg_src(ctx, (reg_t)(cat4->c.src), cat4->full,
-                               cat4->src_r, cat4->c.src_c, cat4->src_im,
-                               cat4->src_neg, cat4->src_abs, false);
-       } else if (cat4->rel.src_rel) {
-               print_reg_src(ctx, (reg_t)(cat4->rel.src), cat4->full,
-                               cat4->src_r, cat4->rel.src_c, cat4->src_im,
-                               cat4->src_neg, cat4->src_abs, cat4->rel.src_rel);
-       } else {
-               print_reg_src(ctx, (reg_t)(cat4->src), cat4->full,
-                               cat4->src_r, false, cat4->src_im,
-                               cat4->src_neg, cat4->src_abs, false);
-       }
-
-       if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
-               fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
-}
-
-static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
-{
-       static const struct {
-               bool src1, src2, samp, tex;
-       } info[0x1f] = {
-                       [opc_op(OPC_ISAM)]     = { true,  false, true,  true,  },
-                       [opc_op(OPC_ISAML)]    = { true,  true,  true,  true,  },
-                       [opc_op(OPC_ISAMM)]    = { true,  false, true,  true,  },
-                       [opc_op(OPC_SAM)]      = { true,  false, true,  true,  },
-                       [opc_op(OPC_SAMB)]     = { true,  true,  true,  true,  },
-                       [opc_op(OPC_SAML)]     = { true,  true,  true,  true,  },
-                       [opc_op(OPC_SAMGQ)]    = { true,  false, true,  true,  },
-                       [opc_op(OPC_GETLOD)]   = { true,  false, true,  true,  },
-                       [opc_op(OPC_CONV)]     = { true,  true,  true,  true,  },
-                       [opc_op(OPC_CONVM)]    = { true,  true,  true,  true,  },
-                       [opc_op(OPC_GETSIZE)]  = { true,  false, false, true,  },
-                       [opc_op(OPC_GETBUF)]   = { false, false, false, true,  },
-                       [opc_op(OPC_GETPOS)]   = { true,  false, false, true,  },
-                       [opc_op(OPC_GETINFO)]  = { false, false, false, true,  },
-                       [opc_op(OPC_DSX)]      = { true,  false, false, false, },
-                       [opc_op(OPC_DSY)]      = { true,  false, false, false, },
-                       [opc_op(OPC_GATHER4R)] = { true,  false, true,  true,  },
-                       [opc_op(OPC_GATHER4G)] = { true,  false, true,  true,  },
-                       [opc_op(OPC_GATHER4B)] = { true,  false, true,  true,  },
-                       [opc_op(OPC_GATHER4A)] = { true,  false, true,  true,  },
-                       [opc_op(OPC_SAMGP0)]   = { true,  false, true,  true,  },
-                       [opc_op(OPC_SAMGP1)]   = { true,  false, true,  true,  },
-                       [opc_op(OPC_SAMGP2)]   = { true,  false, true,  true,  },
-                       [opc_op(OPC_SAMGP3)]   = { true,  false, true,  true,  },
-                       [opc_op(OPC_DSXPP_1)]  = { true,  false, false, false, },
-                       [opc_op(OPC_DSYPP_1)]  = { true,  false, false, false, },
-                       [opc_op(OPC_RGETPOS)]  = { false, false, false, false, },
-                       [opc_op(OPC_RGETINFO)] = { false, false, false, false, },
-       };
-       instr_cat5_t *cat5 = &instr->cat5;
-       int i;
-
-       if (cat5->is_3d)   fprintf(ctx->out, ".3d");
-       if (cat5->is_a)    fprintf(ctx->out, ".a");
-       if (cat5->is_o)    fprintf(ctx->out, ".o");
-       if (cat5->is_p)    fprintf(ctx->out, ".p");
-       if (cat5->is_s)    fprintf(ctx->out, ".s");
-       if (cat5->is_s2en) fprintf(ctx->out, ".s2en");
-
-       fprintf(ctx->out, " ");
-
-       switch (_OPC(5, cat5->opc)) {
-       case OPC_DSXPP_1:
-       case OPC_DSYPP_1:
-               break;
-       default:
-               fprintf(ctx->out, "(%s)", type[cat5->type]);
-               break;
-       }
-
-       fprintf(ctx->out, "(");
-       for (i = 0; i < 4; i++)
-               if (cat5->wrmask & (1 << i))
-                       fprintf(ctx->out, "%c", "xyzw"[i]);
-       fprintf(ctx->out, ")");
-
-       print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
-
-       if (info[cat5->opc].src1) {
-               fprintf(ctx->out, ", ");
-               print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false,
-                               false, false, false);
-       }
-
-       if (cat5->is_s2en) {
-               fprintf(ctx->out, ", ");
-               print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
-                               false, false, false);
-               fprintf(ctx->out, ", ");
-               print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false,
-                               false, false, false);
-       } else {
-               if (cat5->is_o || info[cat5->opc].src2) {
-                       fprintf(ctx->out, ", ");
-                       print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full,
-                                       false, false, false, false, false, false);
-               }
-               if (info[cat5->opc].samp)
-                       fprintf(ctx->out, ", s#%d", cat5->norm.samp);
-               if (info[cat5->opc].tex)
-                       fprintf(ctx->out, ", t#%d", cat5->norm.tex);
-       }
-
-       if (debug & PRINT_VERBOSE) {
-               if (cat5->is_s2en) {
-                       if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2))
-                               fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
-               } else {
-                       if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2))
-                               fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
-               }
-       }
-}
-
-static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
-{
-       instr_cat6_t *cat6 = &instr->cat6;
-       char sd = 0, ss = 0;  /* dst/src address space */
-       bool nodst = false;
-       struct reginfo dst, src1, src2;
-       int src1off = 0, dstoff = 0;
-
-       memset(&dst, 0, sizeof(dst));
-       memset(&src1, 0, sizeof(src1));
-       memset(&src2, 0, sizeof(src2));
-
-       switch (_OPC(6, cat6->opc)) {
-       case OPC_RESINFO:
-       case OPC_RESFMT:
-               dst.full  = type_size(cat6->type) == 32;
-               src1.full = type_size(cat6->type) == 32;
-               src2.full = type_size(cat6->type) == 32;
-               break;
-       case OPC_L2G:
-       case OPC_G2L:
-               dst.full = true;
-               src1.full = true;
-               src2.full = true;
-               break;
-       case OPC_STG:
-       case OPC_STL:
-       case OPC_STP:
-       case OPC_STI:
-       case OPC_STLW:
-       case OPC_STIB:
-               dst.full  = true;
-               src1.full = type_size(cat6->type) == 32;
-               src2.full = type_size(cat6->type) == 32;
-               break;
-       default:
-               dst.full  = type_size(cat6->type) == 32;
-               src1.full = true;
-               src2.full = true;
-               break;
-       }
-
-       switch (_OPC(6, cat6->opc)) {
-       case OPC_PREFETCH:
-               break;
-       case OPC_RESINFO:
-               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-               break;
-       case OPC_LDGB:
-               fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
-               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-               fprintf(ctx->out, ".%s", type[cat6->type]);
-               fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
-               break;
-       case OPC_STGB:
-       case OPC_STIB:
-               fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
-               fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
-               fprintf(ctx->out, ".%s", type[cat6->type]);
-               fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
-               break;
-       case OPC_ATOMIC_ADD:
-       case OPC_ATOMIC_SUB:
-       case OPC_ATOMIC_XCHG:
-       case OPC_ATOMIC_INC:
-       case OPC_ATOMIC_DEC:
-       case OPC_ATOMIC_CMPXCHG:
-       case OPC_ATOMIC_MIN:
-       case OPC_ATOMIC_MAX:
-       case OPC_ATOMIC_AND:
-       case OPC_ATOMIC_OR:
-       case OPC_ATOMIC_XOR:
-               ss = cat6->g ? 'g' : 'l';
-               fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
-               fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
-               fprintf(ctx->out, ".%s", type[cat6->type]);
-               fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
-               fprintf(ctx->out, ".%c", ss);
-               break;
-       default:
-               dst.im = cat6->g && !cat6->dst_off;
-               fprintf(ctx->out, ".%s", type[cat6->type]);
-               break;
-       }
-       fprintf(ctx->out, " ");
-
-       switch (_OPC(6, cat6->opc)) {
-       case OPC_STG:
-               sd = 'g';
-               break;
-       case OPC_STP:
-               sd = 'p';
-               break;
-       case OPC_STL:
-       case OPC_STLW:
-               sd = 'l';
-               break;
-
-       case OPC_LDG:
-       case OPC_LDC:
-               ss = 'g';
-               break;
-       case OPC_LDP:
-               ss = 'p';
-               break;
-       case OPC_LDL:
-       case OPC_LDLW:
-       case OPC_LDLV:
-               ss = 'l';
-               break;
-
-       case OPC_L2G:
-               ss = 'l';
-               sd = 'g';
-               break;
-
-       case OPC_G2L:
-               ss = 'g';
-               sd = 'l';
-               break;
-
-       case OPC_PREFETCH:
-               ss = 'g';
-               nodst = true;
-               break;
-
-       case OPC_STI:
-               dst.full = false;  // XXX or inverts??
-               break;
-       }
-
-       if ((_OPC(6, cat6->opc) == OPC_STGB) || (_OPC(6, cat6->opc) == OPC_STIB)) {
-               struct reginfo src3;
-
-               memset(&src3, 0, sizeof(src3));
-
-               src1.reg = (reg_t)(cat6->stgb.src1);
-               src2.reg = (reg_t)(cat6->stgb.src2);
-               src2.im  = cat6->stgb.src2_im;
-               src3.reg = (reg_t)(cat6->stgb.src3);
-               src3.im  = cat6->stgb.src3_im;
-               src3.full = true;
-
-               fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
-               print_src(ctx, &src1);
-               fprintf(ctx->out, ", ");
-               print_src(ctx, &src2);
-               fprintf(ctx->out, ", ");
-               print_src(ctx, &src3);
-
-               if (debug & PRINT_VERBOSE)
-                       fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
-
-               return;
-       }
-
-       if (is_atomic(_OPC(6, cat6->opc))) {
-
-               src1.reg = (reg_t)(cat6->ldgb.src1);
-               src1.im  = cat6->ldgb.src1_im;
-               src2.reg = (reg_t)(cat6->ldgb.src2);
-               src2.im  = cat6->ldgb.src2_im;
-               dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-               print_src(ctx, &dst);
-               fprintf(ctx->out, ", ");
-               if (ss == 'g') {
-                       struct reginfo src3;
-                       memset(&src3, 0, sizeof(src3));
-
-                       src3.reg = (reg_t)(cat6->ldgb.src3);
-                       src3.full = true;
-
-                       /* For images, the ".typed" variant is used and src2 is
-                        * the ivecN coordinates, ie ivec2 for 2d.
-                        *
-                        * For SSBOs, the ".untyped" variant is used and src2 is
-                        * a simple dword offset..  src3 appears to be
-                        * uvec2(offset * 4, 0).  Not sure the point of that.
-                        */
-
-                       fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
-                       print_src(ctx, &src1);  /* value */
-                       fprintf(ctx->out, ", ");
-                       print_src(ctx, &src2);  /* offset/coords */
-                       fprintf(ctx->out, ", ");
-                       print_src(ctx, &src3);  /* 64b byte offset.. */
-
-                       if (debug & PRINT_VERBOSE) {
-                               fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
-                                               cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-                       }
-               } else { /* ss == 'l' */
-                       fprintf(ctx->out, "l[");
-                       print_src(ctx, &src1);  /* simple byte offset */
-                       fprintf(ctx->out, "], ");
-                       print_src(ctx, &src2);  /* value */
-
-                       if (debug & PRINT_VERBOSE) {
-                               fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
-                                               cat6->ldgb.src3, cat6->ldgb.pad0,
-                                               cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-                       }
-               }
-
-               return;
-       } else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
-               dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-               print_src(ctx, &dst);
-               fprintf(ctx->out, ", ");
-               fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
-
-               return;
-       } else if (_OPC(6, cat6->opc) == OPC_LDGB) {
-
-               src1.reg = (reg_t)(cat6->ldgb.src1);
-               src1.im  = cat6->ldgb.src1_im;
-               src2.reg = (reg_t)(cat6->ldgb.src2);
-               src2.im  = cat6->ldgb.src2_im;
-               dst.reg  = (reg_t)(cat6->ldgb.dst);
-
-               print_src(ctx, &dst);
-               fprintf(ctx->out, ", ");
-               fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
-               print_src(ctx, &src1);
-               fprintf(ctx->out, ", ");
-               print_src(ctx, &src2);
-
-               if (debug & PRINT_VERBOSE)
-                       fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
-
-               return;
-       }
-       if (cat6->dst_off) {
-               dst.reg = (reg_t)(cat6->c.dst);
-               dstoff  = cat6->c.off;
-       } else {
-               dst.reg = (reg_t)(cat6->d.dst);
-       }
-
-       if (cat6->src_off) {
-               src1.reg = (reg_t)(cat6->a.src1);
-               src1.im  = cat6->a.src1_im;
-               src2.reg = (reg_t)(cat6->a.src2);
-               src2.im  = cat6->a.src2_im;
-               src1off  = cat6->a.off;
-       } else {
-               src1.reg = (reg_t)(cat6->b.src1);
-               src1.im  = cat6->b.src1_im;
-               src2.reg = (reg_t)(cat6->b.src2);
-               src2.im  = cat6->b.src2_im;
-       }
-
-       if (!nodst) {
-               if (sd)
-                       fprintf(ctx->out, "%c[", sd);
-               /* note: dst might actually be a src (ie. address to store to) */
-               print_src(ctx, &dst);
-               if (dstoff)
-                       fprintf(ctx->out, "%+d", dstoff);
-               if (sd)
-                       fprintf(ctx->out, "]");
-               fprintf(ctx->out, ", ");
-       }
-
-       if (ss)
-               fprintf(ctx->out, "%c[", ss);
-
-       /* can have a larger than normal immed, so hack: */
-       if (src1.im) {
-               fprintf(ctx->out, "%u", src1.reg.dummy13);
-       } else {
-               print_src(ctx, &src1);
-       }
-
-       if (src1off)
-               fprintf(ctx->out, "%+d", src1off);
-       if (ss)
-               fprintf(ctx->out, "]");
-
-       switch (_OPC(6, cat6->opc)) {
-       case OPC_RESINFO:
-       case OPC_RESFMT:
-               break;
-       default:
-               fprintf(ctx->out, ", ");
-               print_src(ctx, &src2);
-               break;
-       }
-}
-
-static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
-{
-       instr_cat7_t *cat7 = &instr->cat7;
-
-       if (cat7->g)
-               fprintf(ctx->out, ".g");
-       if (cat7->l)
-               fprintf(ctx->out, ".l");
-
-       if (_OPC(7, cat7->opc) == OPC_FENCE) {
-               if (cat7->r)
-                       fprintf(ctx->out, ".r");
-               if (cat7->w)
-                       fprintf(ctx->out, ".w");
-       }
-}
-
-/* size of largest OPC field of all the instruction categories: */
-#define NOPC_BITS 6
-
-static const struct opc_info {
-       uint16_t cat;
-       uint16_t opc;
-       const char *name;
-       void (*print)(struct disasm_ctx *ctx, instr_t *instr);
-} opcs[1 << (3+NOPC_BITS)] = {
-#define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
-       /* category 0: */
-       OPC(0, OPC_NOP,          nop),
-       OPC(0, OPC_BR,           br),
-       OPC(0, OPC_JUMP,         jump),
-       OPC(0, OPC_CALL,         call),
-       OPC(0, OPC_RET,          ret),
-       OPC(0, OPC_KILL,         kill),
-       OPC(0, OPC_END,          end),
-       OPC(0, OPC_EMIT,         emit),
-       OPC(0, OPC_CUT,          cut),
-       OPC(0, OPC_CHMASK,       chmask),
-       OPC(0, OPC_CHSH,         chsh),
-       OPC(0, OPC_FLOW_REV,     flow_rev),
-
-       /* category 1: */
-       OPC(1, OPC_MOV, ),
-
-       /* category 2: */
-       OPC(2, OPC_ADD_F,        add.f),
-       OPC(2, OPC_MIN_F,        min.f),
-       OPC(2, OPC_MAX_F,        max.f),
-       OPC(2, OPC_MUL_F,        mul.f),
-       OPC(2, OPC_SIGN_F,       sign.f),
-       OPC(2, OPC_CMPS_F,       cmps.f),
-       OPC(2, OPC_ABSNEG_F,     absneg.f),
-       OPC(2, OPC_CMPV_F,       cmpv.f),
-       OPC(2, OPC_FLOOR_F,      floor.f),
-       OPC(2, OPC_CEIL_F,       ceil.f),
-       OPC(2, OPC_RNDNE_F,      rndne.f),
-       OPC(2, OPC_RNDAZ_F,      rndaz.f),
-       OPC(2, OPC_TRUNC_F,      trunc.f),
-       OPC(2, OPC_ADD_U,        add.u),
-       OPC(2, OPC_ADD_S,        add.s),
-       OPC(2, OPC_SUB_U,        sub.u),
-       OPC(2, OPC_SUB_S,        sub.s),
-       OPC(2, OPC_CMPS_U,       cmps.u),
-       OPC(2, OPC_CMPS_S,       cmps.s),
-       OPC(2, OPC_MIN_U,        min.u),
-       OPC(2, OPC_MIN_S,        min.s),
-       OPC(2, OPC_MAX_U,        max.u),
-       OPC(2, OPC_MAX_S,        max.s),
-       OPC(2, OPC_ABSNEG_S,     absneg.s),
-       OPC(2, OPC_AND_B,        and.b),
-       OPC(2, OPC_OR_B,         or.b),
-       OPC(2, OPC_NOT_B,        not.b),
-       OPC(2, OPC_XOR_B,        xor.b),
-       OPC(2, OPC_CMPV_U,       cmpv.u),
-       OPC(2, OPC_CMPV_S,       cmpv.s),
-       OPC(2, OPC_MUL_U,        mul.u),
-       OPC(2, OPC_MUL_S,        mul.s),
-       OPC(2, OPC_MULL_U,       mull.u),
-       OPC(2, OPC_BFREV_B,      bfrev.b),
-       OPC(2, OPC_CLZ_S,        clz.s),
-       OPC(2, OPC_CLZ_B,        clz.b),
-       OPC(2, OPC_SHL_B,        shl.b),
-       OPC(2, OPC_SHR_B,        shr.b),
-       OPC(2, OPC_ASHR_B,       ashr.b),
-       OPC(2, OPC_BARY_F,       bary.f),
-       OPC(2, OPC_MGEN_B,       mgen.b),
-       OPC(2, OPC_GETBIT_B,     getbit.b),
-       OPC(2, OPC_SETRM,        setrm),
-       OPC(2, OPC_CBITS_B,      cbits.b),
-       OPC(2, OPC_SHB,          shb),
-       OPC(2, OPC_MSAD,         msad),
-
-       /* category 3: */
-       OPC(3, OPC_MAD_U16,      mad.u16),
-       OPC(3, OPC_MADSH_U16,    madsh.u16),
-       OPC(3, OPC_MAD_S16,      mad.s16),
-       OPC(3, OPC_MADSH_M16,    madsh.m16),
-       OPC(3, OPC_MAD_U24,      mad.u24),
-       OPC(3, OPC_MAD_S24,      mad.s24),
-       OPC(3, OPC_MAD_F16,      mad.f16),
-       OPC(3, OPC_MAD_F32,      mad.f32),
-       OPC(3, OPC_SEL_B16,      sel.b16),
-       OPC(3, OPC_SEL_B32,      sel.b32),
-       OPC(3, OPC_SEL_S16,      sel.s16),
-       OPC(3, OPC_SEL_S32,      sel.s32),
-       OPC(3, OPC_SEL_F16,      sel.f16),
-       OPC(3, OPC_SEL_F32,      sel.f32),
-       OPC(3, OPC_SAD_S16,      sad.s16),
-       OPC(3, OPC_SAD_S32,      sad.s32),
-
-       /* category 4: */
-       OPC(4, OPC_RCP,          rcp),
-       OPC(4, OPC_RSQ,          rsq),
-       OPC(4, OPC_LOG2,         log2),
-       OPC(4, OPC_EXP2,         exp2),
-       OPC(4, OPC_SIN,          sin),
-       OPC(4, OPC_COS,          cos),
-       OPC(4, OPC_SQRT,         sqrt),
-
-       /* category 5: */
-       OPC(5, OPC_ISAM,         isam),
-       OPC(5, OPC_ISAML,        isaml),
-       OPC(5, OPC_ISAMM,        isamm),
-       OPC(5, OPC_SAM,          sam),
-       OPC(5, OPC_SAMB,         samb),
-       OPC(5, OPC_SAML,         saml),
-       OPC(5, OPC_SAMGQ,        samgq),
-       OPC(5, OPC_GETLOD,       getlod),
-       OPC(5, OPC_CONV,         conv),
-       OPC(5, OPC_CONVM,        convm),
-       OPC(5, OPC_GETSIZE,      getsize),
-       OPC(5, OPC_GETBUF,       getbuf),
-       OPC(5, OPC_GETPOS,       getpos),
-       OPC(5, OPC_GETINFO,      getinfo),
-       OPC(5, OPC_DSX,          dsx),
-       OPC(5, OPC_DSY,          dsy),
-       OPC(5, OPC_GATHER4R,     gather4r),
-       OPC(5, OPC_GATHER4G,     gather4g),
-       OPC(5, OPC_GATHER4B,     gather4b),
-       OPC(5, OPC_GATHER4A,     gather4a),
-       OPC(5, OPC_SAMGP0,       samgp0),
-       OPC(5, OPC_SAMGP1,       samgp1),
-       OPC(5, OPC_SAMGP2,       samgp2),
-       OPC(5, OPC_SAMGP3,       samgp3),
-       OPC(5, OPC_DSXPP_1,      dsxpp.1),
-       OPC(5, OPC_DSYPP_1,      dsypp.1),
-       OPC(5, OPC_RGETPOS,      rgetpos),
-       OPC(5, OPC_RGETINFO,     rgetinfo),
-
-
-       /* category 6: */
-       OPC(6, OPC_LDG,          ldg),
-       OPC(6, OPC_LDL,          ldl),
-       OPC(6, OPC_LDP,          ldp),
-       OPC(6, OPC_STG,          stg),
-       OPC(6, OPC_STL,          stl),
-       OPC(6, OPC_STP,          stp),
-       OPC(6, OPC_STI,          sti),
-       OPC(6, OPC_G2L,          g2l),
-       OPC(6, OPC_L2G,          l2g),
-       OPC(6, OPC_PREFETCH,     prefetch),
-       OPC(6, OPC_LDLW,         ldlw),
-       OPC(6, OPC_STLW,         stlw),
-       OPC(6, OPC_RESFMT,       resfmt),
-       OPC(6, OPC_RESINFO,      resinfo),
-       OPC(6, OPC_ATOMIC_ADD,     atomic.add),
-       OPC(6, OPC_ATOMIC_SUB,     atomic.sub),
-       OPC(6, OPC_ATOMIC_XCHG,    atomic.xchg),
-       OPC(6, OPC_ATOMIC_INC,     atomic.inc),
-       OPC(6, OPC_ATOMIC_DEC,     atomic.dec),
-       OPC(6, OPC_ATOMIC_CMPXCHG, atomic.cmpxchg),
-       OPC(6, OPC_ATOMIC_MIN,     atomic.min),
-       OPC(6, OPC_ATOMIC_MAX,     atomic.max),
-       OPC(6, OPC_ATOMIC_AND,     atomic.and),
-       OPC(6, OPC_ATOMIC_OR,      atomic.or),
-       OPC(6, OPC_ATOMIC_XOR,     atomic.xor),
-       OPC(6, OPC_LDGB,         ldgb),
-       OPC(6, OPC_STGB,         stgb),
-       OPC(6, OPC_STIB,         stib),
-       OPC(6, OPC_LDC,          ldc),
-       OPC(6, OPC_LDLV,         ldlv),
-
-       OPC(7, OPC_BAR,          bar),
-       OPC(7, OPC_FENCE,        fence),
-
-#undef OPC
-};
-
-#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)]))
-
-// XXX hack.. probably should move this table somewhere common:
-#include "ir3.h"
-const char *ir3_instr_name(struct ir3_instruction *instr)
-{
-       if (opc_cat(instr->opc) == -1) return "??meta??";
-       return opcs[instr->opc].name;
-}
-
-static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
-{
-       instr_t *instr = (instr_t *)dwords;
-       uint32_t opc = instr_opc(instr);
-       const char *name;
-
-       if (debug & PRINT_VERBOSE)
-               fprintf(ctx->out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]);
-
-       /* NOTE: order flags are printed is a bit fugly.. but for now I
-        * try to match the order in llvm-a3xx disassembler for easy
-        * diff'ing..
-        */
-
-       ctx->repeat = instr_repeat(instr);
-
-       if (instr->sync)
-               fprintf(ctx->out, "(sy)");
-       if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7)))
-               fprintf(ctx->out, "(ss)");
-       if (instr->jmp_tgt)
-               fprintf(ctx->out, "(jp)");
-       if (instr_sat(instr))
-               fprintf(ctx->out, "(sat)");
-       if (ctx->repeat)
-               fprintf(ctx->out, "(rpt%d)", ctx->repeat);
-       if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
-               fprintf(ctx->out, "(ul)");
-
-       name = GETINFO(instr)->name;
-
-       if (name) {
-               fprintf(ctx->out, "%s", name);
-               GETINFO(instr)->print(ctx, instr);
-       } else {
-               fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc);
-       }
-
-       fprintf(ctx->out, "\n");
-
-       return (instr->opc_cat == 0) && (opc == OPC_END);
-}
-
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out)
-{
-       struct disasm_ctx ctx;
-       int i;
-
-       assert((sizedwords % 2) == 0);
-
-       memset(&ctx, 0, sizeof(ctx));
-       ctx.out = out;
-       ctx.level = level;
-
-       for (i = 0; i < sizedwords; i += 2)
-               print_instr(&ctx, &dwords[i], i/2);
-
-       return 0;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
deleted file mode 100644 (file)
index 7f60ee5..0000000
+++ /dev/null
@@ -1,872 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef INSTR_A3XX_H_
-#define INSTR_A3XX_H_
-
-#define PACKED __attribute__((__packed__))
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include <assert.h>
-
-/* size of largest OPC field of all the instruction categories: */
-#define NOPC_BITS 6
-
-#define _OPC(cat, opc)   (((cat) << NOPC_BITS) | opc)
-
-typedef enum {
-       /* category 0: */
-       OPC_NOP             = _OPC(0, 0),
-       OPC_BR              = _OPC(0, 1),
-       OPC_JUMP            = _OPC(0, 2),
-       OPC_CALL            = _OPC(0, 3),
-       OPC_RET             = _OPC(0, 4),
-       OPC_KILL            = _OPC(0, 5),
-       OPC_END             = _OPC(0, 6),
-       OPC_EMIT            = _OPC(0, 7),
-       OPC_CUT             = _OPC(0, 8),
-       OPC_CHMASK          = _OPC(0, 9),
-       OPC_CHSH            = _OPC(0, 10),
-       OPC_FLOW_REV        = _OPC(0, 11),
-
-       /* category 1: */
-       OPC_MOV             = _OPC(1, 0),
-
-       /* category 2: */
-       OPC_ADD_F           = _OPC(2, 0),
-       OPC_MIN_F           = _OPC(2, 1),
-       OPC_MAX_F           = _OPC(2, 2),
-       OPC_MUL_F           = _OPC(2, 3),
-       OPC_SIGN_F          = _OPC(2, 4),
-       OPC_CMPS_F          = _OPC(2, 5),
-       OPC_ABSNEG_F        = _OPC(2, 6),
-       OPC_CMPV_F          = _OPC(2, 7),
-       /* 8 - invalid */
-       OPC_FLOOR_F         = _OPC(2, 9),
-       OPC_CEIL_F          = _OPC(2, 10),
-       OPC_RNDNE_F         = _OPC(2, 11),
-       OPC_RNDAZ_F         = _OPC(2, 12),
-       OPC_TRUNC_F         = _OPC(2, 13),
-       /* 14-15 - invalid */
-       OPC_ADD_U           = _OPC(2, 16),
-       OPC_ADD_S           = _OPC(2, 17),
-       OPC_SUB_U           = _OPC(2, 18),
-       OPC_SUB_S           = _OPC(2, 19),
-       OPC_CMPS_U          = _OPC(2, 20),
-       OPC_CMPS_S          = _OPC(2, 21),
-       OPC_MIN_U           = _OPC(2, 22),
-       OPC_MIN_S           = _OPC(2, 23),
-       OPC_MAX_U           = _OPC(2, 24),
-       OPC_MAX_S           = _OPC(2, 25),
-       OPC_ABSNEG_S        = _OPC(2, 26),
-       /* 27 - invalid */
-       OPC_AND_B           = _OPC(2, 28),
-       OPC_OR_B            = _OPC(2, 29),
-       OPC_NOT_B           = _OPC(2, 30),
-       OPC_XOR_B           = _OPC(2, 31),
-       /* 32 - invalid */
-       OPC_CMPV_U          = _OPC(2, 33),
-       OPC_CMPV_S          = _OPC(2, 34),
-       /* 35-47 - invalid */
-       OPC_MUL_U           = _OPC(2, 48),
-       OPC_MUL_S           = _OPC(2, 49),
-       OPC_MULL_U          = _OPC(2, 50),
-       OPC_BFREV_B         = _OPC(2, 51),
-       OPC_CLZ_S           = _OPC(2, 52),
-       OPC_CLZ_B           = _OPC(2, 53),
-       OPC_SHL_B           = _OPC(2, 54),
-       OPC_SHR_B           = _OPC(2, 55),
-       OPC_ASHR_B          = _OPC(2, 56),
-       OPC_BARY_F          = _OPC(2, 57),
-       OPC_MGEN_B          = _OPC(2, 58),
-       OPC_GETBIT_B        = _OPC(2, 59),
-       OPC_SETRM           = _OPC(2, 60),
-       OPC_CBITS_B         = _OPC(2, 61),
-       OPC_SHB             = _OPC(2, 62),
-       OPC_MSAD            = _OPC(2, 63),
-
-       /* category 3: */
-       OPC_MAD_U16         = _OPC(3, 0),
-       OPC_MADSH_U16       = _OPC(3, 1),
-       OPC_MAD_S16         = _OPC(3, 2),
-       OPC_MADSH_M16       = _OPC(3, 3),   /* should this be .s16? */
-       OPC_MAD_U24         = _OPC(3, 4),
-       OPC_MAD_S24         = _OPC(3, 5),
-       OPC_MAD_F16         = _OPC(3, 6),
-       OPC_MAD_F32         = _OPC(3, 7),
-       OPC_SEL_B16         = _OPC(3, 8),
-       OPC_SEL_B32         = _OPC(3, 9),
-       OPC_SEL_S16         = _OPC(3, 10),
-       OPC_SEL_S32         = _OPC(3, 11),
-       OPC_SEL_F16         = _OPC(3, 12),
-       OPC_SEL_F32         = _OPC(3, 13),
-       OPC_SAD_S16         = _OPC(3, 14),
-       OPC_SAD_S32         = _OPC(3, 15),
-
-       /* category 4: */
-       OPC_RCP             = _OPC(4, 0),
-       OPC_RSQ             = _OPC(4, 1),
-       OPC_LOG2            = _OPC(4, 2),
-       OPC_EXP2            = _OPC(4, 3),
-       OPC_SIN             = _OPC(4, 4),
-       OPC_COS             = _OPC(4, 5),
-       OPC_SQRT            = _OPC(4, 6),
-       // 7-63 - invalid
-
-       /* category 5: */
-       OPC_ISAM            = _OPC(5, 0),
-       OPC_ISAML           = _OPC(5, 1),
-       OPC_ISAMM           = _OPC(5, 2),
-       OPC_SAM             = _OPC(5, 3),
-       OPC_SAMB            = _OPC(5, 4),
-       OPC_SAML            = _OPC(5, 5),
-       OPC_SAMGQ           = _OPC(5, 6),
-       OPC_GETLOD          = _OPC(5, 7),
-       OPC_CONV            = _OPC(5, 8),
-       OPC_CONVM           = _OPC(5, 9),
-       OPC_GETSIZE         = _OPC(5, 10),
-       OPC_GETBUF          = _OPC(5, 11),
-       OPC_GETPOS          = _OPC(5, 12),
-       OPC_GETINFO         = _OPC(5, 13),
-       OPC_DSX             = _OPC(5, 14),
-       OPC_DSY             = _OPC(5, 15),
-       OPC_GATHER4R        = _OPC(5, 16),
-       OPC_GATHER4G        = _OPC(5, 17),
-       OPC_GATHER4B        = _OPC(5, 18),
-       OPC_GATHER4A        = _OPC(5, 19),
-       OPC_SAMGP0          = _OPC(5, 20),
-       OPC_SAMGP1          = _OPC(5, 21),
-       OPC_SAMGP2          = _OPC(5, 22),
-       OPC_SAMGP3          = _OPC(5, 23),
-       OPC_DSXPP_1         = _OPC(5, 24),
-       OPC_DSYPP_1         = _OPC(5, 25),
-       OPC_RGETPOS         = _OPC(5, 26),
-       OPC_RGETINFO        = _OPC(5, 27),
-
-       /* category 6: */
-       OPC_LDG             = _OPC(6, 0),        /* load-global */
-       OPC_LDL             = _OPC(6, 1),
-       OPC_LDP             = _OPC(6, 2),
-       OPC_STG             = _OPC(6, 3),        /* store-global */
-       OPC_STL             = _OPC(6, 4),
-       OPC_STP             = _OPC(6, 5),
-       OPC_STI             = _OPC(6, 6),
-       OPC_G2L             = _OPC(6, 7),
-       OPC_L2G             = _OPC(6, 8),
-       OPC_PREFETCH        = _OPC(6, 9),
-       OPC_LDLW            = _OPC(6, 10),
-       OPC_STLW            = _OPC(6, 11),
-       OPC_RESFMT          = _OPC(6, 14),
-       OPC_RESINFO         = _OPC(6, 15),
-       OPC_ATOMIC_ADD      = _OPC(6, 16),
-       OPC_ATOMIC_SUB      = _OPC(6, 17),
-       OPC_ATOMIC_XCHG     = _OPC(6, 18),
-       OPC_ATOMIC_INC      = _OPC(6, 19),
-       OPC_ATOMIC_DEC      = _OPC(6, 20),
-       OPC_ATOMIC_CMPXCHG  = _OPC(6, 21),
-       OPC_ATOMIC_MIN      = _OPC(6, 22),
-       OPC_ATOMIC_MAX      = _OPC(6, 23),
-       OPC_ATOMIC_AND      = _OPC(6, 24),
-       OPC_ATOMIC_OR       = _OPC(6, 25),
-       OPC_ATOMIC_XOR      = _OPC(6, 26),
-       OPC_LDGB            = _OPC(6, 27),
-       OPC_STGB            = _OPC(6, 28),
-       OPC_STIB            = _OPC(6, 29),
-       OPC_LDC             = _OPC(6, 30),
-       OPC_LDLV            = _OPC(6, 31),
-
-       /* category 7: */
-       OPC_BAR             = _OPC(7, 0),
-       OPC_FENCE           = _OPC(7, 1),
-
-       /* meta instructions (category -1): */
-       /* placeholder instr to mark shader inputs: */
-       OPC_META_INPUT      = _OPC(-1, 0),
-       /* The "fan-in" and "fan-out" meta instructions are used to keep
-        * track of instructions that write to multiple dst registers
-        * (fan-out), like texture sample instructions, or that read
-        * multiple consecutive scalar registers (fan-in), like bary.f
-        * and texture sample instructions.
-        */
-       OPC_META_FO         = _OPC(-1, 2),
-       OPC_META_FI         = _OPC(-1, 3),
-
-} opc_t;
-
-#define opc_cat(opc) ((int)((opc) >> NOPC_BITS))
-#define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))
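
As a quick illustration of how an opc_t value round-trips through these
macros, here is a self-contained sketch; NOPC_BITS and _OPC are re-stated
locally and assumed to mirror the definitions earlier in this header:

    #include <assert.h>

    #define NOPC_BITS 6  /* assumed to match the header */
    #define _OPC(cat, opc) (((cat) << NOPC_BITS) | (opc))
    #define opc_cat(opc) ((int)((opc) >> NOPC_BITS))
    #define opc_op(opc)  ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))

    int main(void)
    {
            int mad_f32 = _OPC(3, 7);      /* OPC_MAD_F32 in the enum above */
            assert(opc_cat(mad_f32) == 3); /* category from the high bits */
            assert(opc_op(mad_f32) == 7);  /* opcode from the low NOPC_BITS */
            return 0;
    }
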
-
-typedef enum {
-       TYPE_F16 = 0,
-       TYPE_F32 = 1,
-       TYPE_U16 = 2,
-       TYPE_U32 = 3,
-       TYPE_S16 = 4,
-       TYPE_S32 = 5,
-       TYPE_U8  = 6,
-       TYPE_S8  = 7,  // XXX I assume?
-} type_t;
-
-static inline uint32_t type_size(type_t type)
-{
-       switch (type) {
-       case TYPE_F32:
-       case TYPE_U32:
-       case TYPE_S32:
-               return 32;
-       case TYPE_F16:
-       case TYPE_U16:
-       case TYPE_S16:
-               return 16;
-       case TYPE_U8:
-       case TYPE_S8:
-               return 8;
-       default:
-               assert(0); /* invalid type */
-               return 0;
-       }
-}
-
-static inline int type_float(type_t type)
-{
-       return (type == TYPE_F32) || (type == TYPE_F16);
-}
-
-static inline int type_uint(type_t type)
-{
-       return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
-}
-
-static inline int type_sint(type_t type)
-{
-       return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
-}
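
Together, type_size() and the three predicates fully classify a
conversion type by width and by float/unsigned/signed-ness. A minimal
usage sketch, assuming this header can be included standalone:

    #include <assert.h>
    #include "instr-a3xx.h"

    int main(void)
    {
            assert(type_size(TYPE_F16) == 16 && type_float(TYPE_F16));
            assert(type_size(TYPE_U32) == 32 && type_uint(TYPE_U32));
            assert(type_size(TYPE_S8)  == 8  && type_sint(TYPE_S8));
            return 0;
    }
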
-
-typedef union PACKED {
-       /* normal gpr or const src register: */
-       struct PACKED {
-               uint32_t comp  : 2;
-               uint32_t num   : 10;
-       };
-       /* for immediate val: */
-       int32_t  iim_val   : 11;
-       /* to make compiler happy: */
-       uint32_t dummy32;
-       uint32_t dummy10   : 10;
-       int32_t  idummy10  : 10;
-       uint32_t dummy11   : 11;
-       uint32_t dummy12   : 12;
-       uint32_t dummy13   : 13;
-       uint32_t dummy8    : 8;
-} reg_t;
-
-/* special registers: */
-#define REG_A0 61       /* address register */
-#define REG_P0 62       /* predicate register */
-
-static inline int reg_special(reg_t reg)
-{
-       return (reg.num == REG_A0) || (reg.num == REG_P0);
-}
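
A reg_t plus the special-register numbers is enough to pretty-print a
register field the way disassemblers conventionally do. A sketch, under
the assumption that components are named x/y/z/w in bit order and that
this header is includable standalone:

    #include <stdio.h>
    #include <stdint.h>
    #include "instr-a3xx.h"

    /* decode a raw gpr/const register field into e.g. "r5.z" */
    static void print_reg(uint32_t raw)
    {
            reg_t reg = { .dummy32 = raw };
            static const char comp[] = "xyzw";

            if (reg.num == REG_A0)
                    printf("a0.%c\n", comp[reg.comp]);
            else if (reg.num == REG_P0)
                    printf("p0.%c\n", comp[reg.comp]);
            else
                    printf("r%u.%c\n", reg.num, comp[reg.comp]);
    }

    int main(void)
    {
            print_reg((5 << 2) | 2);       /* r5.z */
            print_reg((REG_A0 << 2) | 0);  /* a0.x */
            return 0;
    }
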
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       int16_t  immed    : 16;
-                       uint32_t dummy1   : 16;
-               } a3xx;
-               struct PACKED {
-                       int32_t  immed    : 20;
-                       uint32_t dummy1   : 12;
-               } a4xx;
-               struct PACKED {
-                       int32_t immed     : 32;
-               } a5xx;
-       };
-
-       /* dword1: */
-       uint32_t dummy2   : 8;
-       uint32_t repeat   : 3;
-       uint32_t dummy3   : 1;
-       uint32_t ss       : 1;
-       uint32_t dummy4   : 7;
-       uint32_t inv      : 1;
-       uint32_t comp     : 2;
-       uint32_t opc      : 4;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat0_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               /* for normal src register: */
-               struct PACKED {
-                       uint32_t src : 11;
-                       /* at least the low bit of pad must be zero or it will
-                        * look like an address-relative src
-                        */
-                       uint32_t pad : 21;
-               };
-               /* for address relative: */
-               struct PACKED {
-                       int32_t  off : 10;
-                       uint32_t src_rel_c : 1;
-                       uint32_t src_rel : 1;
-                       uint32_t unknown : 20;
-               };
-               /* for immediate: */
-               int32_t  iim_val;
-               uint32_t uim_val;
-               float    fim_val;
-       };
-
-       /* dword1: */
-       uint32_t dst        : 8;
-       uint32_t repeat     : 3;
-       uint32_t src_r      : 1;
-       uint32_t ss         : 1;
-       uint32_t ul         : 1;
-       uint32_t dst_type   : 3;
-       uint32_t dst_rel    : 1;
-       uint32_t src_type   : 3;
-       uint32_t src_c      : 1;
-       uint32_t src_im     : 1;
-       uint32_t even       : 1;
-       uint32_t pos_inf    : 1;
-       uint32_t must_be_0  : 2;
-       uint32_t jmp_tgt    : 1;
-       uint32_t sync       : 1;
-       uint32_t opc_cat    : 3;
-} instr_cat1_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src1         : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src1_im      : 1;   /* immediate */
-                       uint32_t src1_neg     : 1;   /* negate */
-                       uint32_t src1_abs     : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src1         : 10;
-                       uint32_t src1_c       : 1;   /* relative-const */
-                       uint32_t src1_rel     : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel1;
-               struct PACKED {
-                       uint32_t src1         : 12;
-                       uint32_t src1_c       : 1;   /* const */
-                       uint32_t dummy        : 3;
-               } c1;
-       };
-
-       union PACKED {
-               struct PACKED {
-                       uint32_t src2         : 11;
-                       uint32_t must_be_zero2: 2;
-                       uint32_t src2_im      : 1;   /* immediate */
-                       uint32_t src2_neg     : 1;   /* negate */
-                       uint32_t src2_abs     : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src2         : 10;
-                       uint32_t src2_c       : 1;   /* relative-const */
-                       uint32_t src2_rel     : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel2;
-               struct PACKED {
-                       uint32_t src2         : 12;
-                       uint32_t src2_c       : 1;   /* const */
-                       uint32_t dummy        : 3;
-               } c2;
-       };
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 2;
-       uint32_t sat      : 1;
-       uint32_t src1_r   : 1;
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;   /* dunno */
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t ei       : 1;
-       uint32_t cond     : 3;
-       uint32_t src2_r   : 1;
-       uint32_t full     : 1;   /* not half */
-       uint32_t opc      : 6;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat2_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src1         : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src2_c       : 1;
-                       uint32_t src1_neg     : 1;
-                       uint32_t src2_r       : 1;
-               };
-               struct PACKED {
-                       uint32_t src1         : 10;
-                       uint32_t src1_c       : 1;
-                       uint32_t src1_rel     : 1;
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel1;
-               struct PACKED {
-                       uint32_t src1         : 12;
-                       uint32_t src1_c       : 1;
-                       uint32_t dummy        : 3;
-               } c1;
-       };
-
-       union PACKED {
-               struct PACKED {
-                       uint32_t src3         : 11;
-                       uint32_t must_be_zero2: 2;
-                       uint32_t src3_r       : 1;
-                       uint32_t src2_neg     : 1;
-                       uint32_t src3_neg     : 1;
-               };
-               struct PACKED {
-                       uint32_t src3         : 10;
-                       uint32_t src3_c       : 1;
-                       uint32_t src3_rel     : 1;
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel2;
-               struct PACKED {
-                       uint32_t src3         : 12;
-                       uint32_t src3_c       : 1;
-                       uint32_t dummy        : 3;
-               } c2;
-       };
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 2;
-       uint32_t sat      : 1;
-       uint32_t src1_r   : 1;
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t src2     : 8;
-       uint32_t opc      : 4;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat3_t;
-
-static inline bool instr_cat3_full(instr_cat3_t *cat3)
-{
-       switch (_OPC(3, cat3->opc)) {
-       case OPC_MAD_F16:
-       case OPC_MAD_U16:
-       case OPC_MAD_S16:
-       case OPC_SEL_B16:
-       case OPC_SEL_S16:
-       case OPC_SEL_F16:
-       case OPC_SAD_S16:
-       case OPC_SAD_S32:  // really??
-               return false;
-       default:
-               return true;
-       }
-}
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               struct PACKED {
-                       uint32_t src          : 11;
-                       uint32_t must_be_zero1: 2;
-                       uint32_t src_im       : 1;   /* immediate */
-                       uint32_t src_neg      : 1;   /* negate */
-                       uint32_t src_abs      : 1;   /* absolute value */
-               };
-               struct PACKED {
-                       uint32_t src          : 10;
-                       uint32_t src_c        : 1;   /* relative-const */
-                       uint32_t src_rel      : 1;   /* relative address */
-                       uint32_t must_be_zero : 1;
-                       uint32_t dummy        : 3;
-               } rel;
-               struct PACKED {
-                       uint32_t src          : 12;
-                       uint32_t src_c        : 1;   /* const */
-                       uint32_t dummy        : 3;
-               } c;
-       };
-       uint32_t dummy1   : 16;  /* seem to be ignored */
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t repeat   : 2;
-       uint32_t sat      : 1;
-       uint32_t src_r    : 1;
-       uint32_t ss       : 1;
-       uint32_t ul       : 1;
-       uint32_t dst_half : 1;   /* or widen/narrow.. ie. dst hrN <-> rN */
-       uint32_t dummy2   : 5;   /* seem to be ignored */
-       uint32_t full     : 1;   /* not half */
-       uint32_t opc      : 6;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat4_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       union PACKED {
-               /* normal case: */
-               struct PACKED {
-                       uint32_t full     : 1;   /* not half */
-                       uint32_t src1     : 8;
-                       uint32_t src2     : 8;
-                       uint32_t dummy1   : 4;   /* seem to be ignored */
-                       uint32_t samp     : 4;
-                       uint32_t tex      : 7;
-               } norm;
-               /* s2en case: */
-               struct PACKED {
-                       uint32_t full     : 1;   /* not half */
-                       uint32_t src1     : 8;
-                       uint32_t src2     : 11;
-                       uint32_t dummy1   : 1;
-                       uint32_t src3     : 8;
-                       uint32_t dummy2   : 3;
-               } s2en;
-               /* same in either case: */
-               // XXX I think, confirm this
-               struct PACKED {
-                       uint32_t full     : 1;   /* not half */
-                       uint32_t src1     : 8;
-                       uint32_t pad      : 23;
-               };
-       };
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t wrmask   : 4;   /* write-mask */
-       uint32_t type     : 3;
-       uint32_t dummy2   : 1;   /* seems to be ignored */
-       uint32_t is_3d    : 1;
-
-       uint32_t is_a     : 1;
-       uint32_t is_s     : 1;
-       uint32_t is_s2en  : 1;
-       uint32_t is_o     : 1;
-       uint32_t is_p     : 1;
-
-       uint32_t opc      : 5;
-       uint32_t jmp_tgt  : 1;
-       uint32_t sync     : 1;
-       uint32_t opc_cat  : 3;
-} instr_cat5_t;
-
-/* dword0 encoding for src_off: [src1 + off], src2: */
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t mustbe1  : 1;
-       int32_t  off      : 13;
-       uint32_t src1     : 8;
-       uint32_t src1_im  : 1;
-       uint32_t src2_im  : 1;
-       uint32_t src2     : 8;
-
-       /* dword1: */
-       uint32_t dword1;
-} instr_cat6a_t;
-
-/* dword0 encoding for !src_off: [src1], src2 */
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t mustbe0  : 1;
-       uint32_t src1     : 13;
-       uint32_t ignore0  : 8;
-       uint32_t src1_im  : 1;
-       uint32_t src2_im  : 1;
-       uint32_t src2     : 8;
-
-       /* dword1: */
-       uint32_t dword1;
-} instr_cat6b_t;
-
-/* dword1 encoding for dst_off: */
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t dword0;
-
-       /* note: there is some weird stuff going on where sometimes
-        * cat6->a.off is involved.. but that seems like a bug in
-        * the blob, since it is used even if !cat6->src_off.
-        * It would make sense for there to be some more bits to
-        * bring us to 11 bits worth of offset, but not sure..
-        */
-       int32_t off       : 8;
-       uint32_t mustbe1  : 1;
-       uint32_t dst      : 8;
-       uint32_t pad1     : 15;
-} instr_cat6c_t;
-
-/* dword1 encoding for !dst_off: */
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t dword0;
-
-       uint32_t dst      : 8;
-       uint32_t mustbe0  : 1;
-       uint32_t idx      : 8;
-       uint32_t pad0     : 15;
-} instr_cat6d_t;
-
-/* ldgb and atomics..
- *
- * ldgb:      pad0=0, pad3=1
- * atomic .g: pad0=1, pad3=1
- *        .l: pad0=1, pad3=0
- */
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t pad0     : 1;
-       uint32_t src3     : 8;
-       uint32_t d        : 2;
-       uint32_t typed    : 1;
-       uint32_t type_size : 2;
-       uint32_t src1     : 8;
-       uint32_t src1_im  : 1;
-       uint32_t src2_im  : 1;
-       uint32_t src2     : 8;
-
-       /* dword1: */
-       uint32_t dst      : 8;
-       uint32_t mustbe0  : 1;
-       uint32_t src_ssbo : 8;
-       uint32_t pad2     : 3;  // type
-       uint32_t g        : 1;
-       uint32_t pad3     : 1;
-       uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
-} instr_cat6ldgb_t;
-
-/* stgb, pad0=0, pad3=2
- */
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t mustbe1  : 1;  // ???
-       uint32_t src1     : 8;
-       uint32_t d        : 2;
-       uint32_t typed    : 1;
-       uint32_t type_size : 2;
-       uint32_t pad0     : 9;
-       uint32_t src2_im  : 1;
-       uint32_t src2     : 8;
-
-       /* dword1: */
-       uint32_t src3     : 8;
-       uint32_t src3_im  : 1;
-       uint32_t dst_ssbo : 8;
-       uint32_t pad2     : 3;  // type
-       uint32_t pad3     : 2;
-       uint32_t pad4     : 10; // opc/jmp_tgt/sync/opc_cat
-} instr_cat6stgb_t;
-
-typedef union PACKED {
-       instr_cat6a_t a;
-       instr_cat6b_t b;
-       instr_cat6c_t c;
-       instr_cat6d_t d;
-       instr_cat6ldgb_t ldgb;
-       instr_cat6stgb_t stgb;
-       struct PACKED {
-               /* dword0: */
-               uint32_t src_off  : 1;
-               uint32_t pad1     : 31;
-
-               /* dword1: */
-               uint32_t pad2     : 8;
-               uint32_t dst_off  : 1;
-               uint32_t pad3     : 8;
-               uint32_t type     : 3;
-               uint32_t g        : 1;  /* or in some cases it means dst immed */
-               uint32_t pad4     : 1;
-               uint32_t opc      : 5;
-               uint32_t jmp_tgt  : 1;
-               uint32_t sync     : 1;
-               uint32_t opc_cat  : 3;
-       };
-} instr_cat6_t;
-
-typedef struct PACKED {
-       /* dword0: */
-       uint32_t pad1     : 32;
-
-       /* dword1: */
-       uint32_t pad2     : 12;
-       uint32_t ss       : 1;  /* maybe in the encoding, but blob only uses (sy) */
-       uint32_t pad3     : 6;
-       uint32_t w        : 1;  /* write */
-       uint32_t r        : 1;  /* read */
-       uint32_t l        : 1;  /* local */
-       uint32_t g        : 1;  /* global */
-       uint32_t opc      : 4;  /* presumed, but only a couple known OPCs */
-       uint32_t jmp_tgt  : 1;  /* (jp) */
-       uint32_t sync     : 1;  /* (sy) */
-       uint32_t opc_cat  : 3;
-} instr_cat7_t;
-
-typedef union PACKED {
-       instr_cat0_t cat0;
-       instr_cat1_t cat1;
-       instr_cat2_t cat2;
-       instr_cat3_t cat3;
-       instr_cat4_t cat4;
-       instr_cat5_t cat5;
-       instr_cat6_t cat6;
-       instr_cat7_t cat7;
-       struct PACKED {
-               /* dword0: */
-               uint32_t pad1     : 32;
-
-               /* dword1: */
-               uint32_t pad2     : 12;
-               uint32_t ss       : 1;  /* cat1-cat4 (cat0??) and cat7 (?) */
-               uint32_t ul       : 1;  /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
-               uint32_t pad3     : 13;
-               uint32_t jmp_tgt  : 1;
-               uint32_t sync     : 1;
-               uint32_t opc_cat  : 3;
-
-       };
-} instr_t;
-
-static inline uint32_t instr_repeat(instr_t *instr)
-{
-       switch (instr->opc_cat) {
-       case 0:  return instr->cat0.repeat;
-       case 1:  return instr->cat1.repeat;
-       case 2:  return instr->cat2.repeat;
-       case 3:  return instr->cat3.repeat;
-       case 4:  return instr->cat4.repeat;
-       default: return 0;
-       }
-}
-
-static inline bool instr_sat(instr_t *instr)
-{
-       switch (instr->opc_cat) {
-       case 2:  return instr->cat2.sat;
-       case 3:  return instr->cat3.sat;
-       case 4:  return instr->cat4.sat;
-       default: return false;
-       }
-}
-
-static inline uint32_t instr_opc(instr_t *instr)
-{
-       switch (instr->opc_cat) {
-       case 0:  return instr->cat0.opc;
-       case 1:  return 0;
-       case 2:  return instr->cat2.opc;
-       case 3:  return instr->cat3.opc;
-       case 4:  return instr->cat4.opc;
-       case 5:  return instr->cat5.opc;
-       case 6:  return instr->cat6.opc;
-       case 7:  return instr->cat7.opc;
-       default: return 0;
-       }
-}
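
Given instr_t and the helpers above, recovering an opc_t from a raw
64-bit instruction is a cast plus _OPC() on the category and opcode
fields. A hedged sketch of that decode step (the zeroed word pair is
presumed to decode as a cat0 nop):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include "instr-a3xx.h"

    /* classify one 64-bit (two dword) instruction */
    static void classify(const uint32_t dwords[2])
    {
            instr_t instr;
            memcpy(&instr, dwords, sizeof(instr));

            opc_t opc = _OPC(instr.opc_cat, instr_opc(&instr));

            printf("cat%d opc %u, repeat=%u%s\n", opc_cat(opc), opc_op(opc),
                            instr_repeat(&instr),
                            instr_sat(&instr) ? " (sat)" : "");
    }

    int main(void)
    {
            uint32_t nop[2] = { 0, 0 };
            classify(nop);
            return 0;
    }
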
-
-static inline bool is_mad(opc_t opc)
-{
-       switch (opc) {
-       case OPC_MAD_U16:
-       case OPC_MAD_S16:
-       case OPC_MAD_U24:
-       case OPC_MAD_S24:
-       case OPC_MAD_F16:
-       case OPC_MAD_F32:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_madsh(opc_t opc)
-{
-       switch (opc) {
-       case OPC_MADSH_U16:
-       case OPC_MADSH_M16:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_atomic(opc_t opc)
-{
-       switch (opc) {
-       case OPC_ATOMIC_ADD:
-       case OPC_ATOMIC_SUB:
-       case OPC_ATOMIC_XCHG:
-       case OPC_ATOMIC_INC:
-       case OPC_ATOMIC_DEC:
-       case OPC_ATOMIC_CMPXCHG:
-       case OPC_ATOMIC_MIN:
-       case OPC_ATOMIC_MAX:
-       case OPC_ATOMIC_AND:
-       case OPC_ATOMIC_OR:
-       case OPC_ATOMIC_XOR:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_ssbo(opc_t opc)
-{
-       switch (opc) {
-       case OPC_RESFMT:
-       case OPC_RESINFO:
-       case OPC_LDGB:
-       case OPC_STGB:
-       case OPC_STIB:
-               return true;
-       default:
-               return false;
-       }
-}
-
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out);
-
-#endif /* INSTR_A3XX_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
deleted file mode 100644 (file)
index 3d1c444..0000000
+++ /dev/null
@@ -1,941 +0,0 @@
-/*
- * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ir3.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <stdbool.h>
-#include <errno.h>
-
-#include "util/bitscan.h"
-#include "util/ralloc.h"
-#include "util/u_math.h"
-
-#include "instr-a3xx.h"
-
-/* simple allocator so that we can free everything easily in one shot;
- * currently the allocations are just tied to the ir3's ralloc context.
- */
-void * ir3_alloc(struct ir3 *shader, int sz)
-{
-       return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
-}
-
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-               unsigned nin, unsigned nout)
-{
-       struct ir3 *shader = rzalloc(compiler, struct ir3);
-
-       shader->compiler = compiler;
-       shader->ninputs = nin;
-       shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin);
-
-       shader->noutputs = nout;
-       shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
-
-       list_inithead(&shader->block_list);
-       list_inithead(&shader->array_list);
-
-       return shader;
-}
-
-void ir3_destroy(struct ir3 *shader)
-{
-       ralloc_free(shader);
-}
-
-#define iassert(cond) do { \
-       if (!(cond)) { \
-               debug_assert(cond); \
-               return -1; \
-       } } while (0)
-
-#define iassert_type(reg, full) do { \
-       if ((full)) { \
-               iassert(!((reg)->flags & IR3_REG_HALF)); \
-       } else { \
-               iassert((reg)->flags & IR3_REG_HALF); \
-       } } while (0)
-
-static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
-               uint32_t repeat, uint32_t valid_flags)
-{
-       reg_t val = { .dummy32 = 0 };
-
-       if (reg->flags & ~valid_flags) {
-               debug_printf("INVALID FLAGS: %x vs %x\n",
-                               reg->flags, valid_flags);
-       }
-
-       if (!(reg->flags & IR3_REG_R))
-               repeat = 0;
-
-       if (reg->flags & IR3_REG_IMMED) {
-               val.iim_val = reg->iim_val;
-       } else {
-               unsigned components;
-               int16_t max;
-
-               if (reg->flags & IR3_REG_RELATIV) {
-                       components = reg->size;
-                       val.idummy10 = reg->array.offset;
-                       max = (reg->array.offset + repeat + components - 1) >> 2;
-               } else {
-                       components = util_last_bit(reg->wrmask);
-                       val.comp = reg->num & 0x3;
-                       val.num  = reg->num >> 2;
-                       max = (reg->num + repeat + components - 1) >> 2;
-               }
-
-               if (reg->flags & IR3_REG_CONST) {
-                       info->max_const = MAX2(info->max_const, max);
-               } else if (val.num == 63) {
-                       /* ignore writes to dummy register r63.x */
-               } else if (max < 48) {
-                       if (reg->flags & IR3_REG_HALF) {
-                               if (info->gpu_id >= 600) {
-                                       /* starting w/ a6xx, half regs conflict with full regs: */
-                                       info->max_reg = MAX2(info->max_reg, (max+1)/2);
-                               } else {
-                                       info->max_half_reg = MAX2(info->max_half_reg, max);
-                               }
-                       } else {
-                               info->max_reg = MAX2(info->max_reg, max);
-                       }
-               }
-       }
-
-       return val.dummy32;
-}
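
The "max" computed above is the highest vec4 GPR footprint of the
access: reg->num counts scalar components (gpr << 2 | comp), so e.g. an
(rpt2) single-component read starting at r5.z (num 22) reaches
(22 + 2 + 1 - 1) >> 2 = 6, i.e. r6 is the highest register touched.
A standalone sketch of just that arithmetic:

    #include <assert.h>

    /* highest vec4 gpr touched by an access starting at scalar reg 'num':
     * the last scalar touched is num + repeat + components - 1, divided
     * down to a vec4 index (mirrors the max computation in reg() above)
     */
    static int reg_footprint(int num, int repeat, int components)
    {
            return (num + repeat + components - 1) >> 2;
    }

    int main(void)
    {
            assert(reg_footprint(22, 0, 1) == 5); /* r5.z alone stays in r5 */
            assert(reg_footprint(22, 2, 1) == 6); /* (rpt2) spills into r6 */
            return 0;
    }
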
-
-static int emit_cat0(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       instr_cat0_t *cat0 = ptr;
-
-       if (info->gpu_id >= 500) {
-               cat0->a5xx.immed = instr->cat0.immed;
-       } else if (info->gpu_id >= 400) {
-               cat0->a4xx.immed = instr->cat0.immed;
-       } else {
-               cat0->a3xx.immed = instr->cat0.immed;
-       }
-       cat0->repeat   = instr->repeat;
-       cat0->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat0->inv      = instr->cat0.inv;
-       cat0->comp     = instr->cat0.comp;
-       cat0->opc      = instr->opc;
-       cat0->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat0->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat0->opc_cat  = 0;
-
-       return 0;
-}
-
-static int emit_cat1(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src = instr->regs[1];
-       instr_cat1_t *cat1 = ptr;
-
-       iassert(instr->regs_count == 2);
-       iassert_type(dst, type_size(instr->cat1.dst_type) == 32);
-       if (!(src->flags & IR3_REG_IMMED))
-               iassert_type(src, type_size(instr->cat1.src_type) == 32);
-
-       if (src->flags & IR3_REG_IMMED) {
-               cat1->iim_val = src->iim_val;
-               cat1->src_im  = 1;
-       } else if (src->flags & IR3_REG_RELATIV) {
-               cat1->off       = reg(src, info, instr->repeat,
-                               IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV);
-               cat1->src_rel   = 1;
-               cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
-       } else {
-               cat1->src  = reg(src, info, instr->repeat,
-                               IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF);
-               cat1->src_c     = !!(src->flags & IR3_REG_CONST);
-       }
-
-       cat1->dst      = reg(dst, info, instr->repeat,
-                       IR3_REG_RELATIV | IR3_REG_EVEN |
-                       IR3_REG_R | IR3_REG_POS_INF | IR3_REG_HALF);
-       cat1->repeat   = instr->repeat;
-       cat1->src_r    = !!(src->flags & IR3_REG_R);
-       cat1->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat1->ul       = !!(instr->flags & IR3_INSTR_UL);
-       cat1->dst_type = instr->cat1.dst_type;
-       cat1->dst_rel  = !!(dst->flags & IR3_REG_RELATIV);
-       cat1->src_type = instr->cat1.src_type;
-       cat1->even     = !!(dst->flags & IR3_REG_EVEN);
-       cat1->pos_inf  = !!(dst->flags & IR3_REG_POS_INF);
-       cat1->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat1->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat1->opc_cat  = 1;
-
-       return 0;
-}
-
-static int emit_cat2(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src1 = instr->regs[1];
-       struct ir3_register *src2 = instr->regs[2];
-       instr_cat2_t *cat2 = ptr;
-       unsigned absneg = ir3_cat2_absneg(instr->opc);
-
-       iassert((instr->regs_count == 2) || (instr->regs_count == 3));
-
-       if (src1->flags & IR3_REG_RELATIV) {
-               iassert(src1->array.offset < (1 << 10));
-               cat2->rel1.src1      = reg(src1, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
-                               IR3_REG_HALF | absneg);
-               cat2->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
-               cat2->rel1.src1_rel  = 1;
-       } else if (src1->flags & IR3_REG_CONST) {
-               iassert(src1->num < (1 << 12));
-               cat2->c1.src1   = reg(src1, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
-               cat2->c1.src1_c = 1;
-       } else {
-               iassert(src1->num < (1 << 11));
-               cat2->src1 = reg(src1, info, instr->repeat,
-                               IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
-                               absneg);
-       }
-       cat2->src1_im  = !!(src1->flags & IR3_REG_IMMED);
-       cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-       cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | IR3_REG_SABS));
-       cat2->src1_r   = !!(src1->flags & IR3_REG_R);
-
-       if (src2) {
-               iassert((src2->flags & IR3_REG_IMMED) ||
-                               !((src1->flags ^ src2->flags) & IR3_REG_HALF));
-
-               if (src2->flags & IR3_REG_RELATIV) {
-                       iassert(src2->array.offset < (1 << 10));
-                       cat2->rel2.src2      = reg(src2, info, instr->repeat,
-                                       IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
-                                       IR3_REG_HALF | absneg);
-                       cat2->rel2.src2_c    = !!(src2->flags & IR3_REG_CONST);
-                       cat2->rel2.src2_rel  = 1;
-               } else if (src2->flags & IR3_REG_CONST) {
-                       iassert(src2->num < (1 << 12));
-                       cat2->c2.src2   = reg(src2, info, instr->repeat,
-                                       IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
-                       cat2->c2.src2_c = 1;
-               } else {
-                       iassert(src2->num < (1 << 11));
-                       cat2->src2 = reg(src2, info, instr->repeat,
-                                       IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
-                                       absneg);
-               }
-
-               cat2->src2_im  = !!(src2->flags & IR3_REG_IMMED);
-               cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-               cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS));
-               cat2->src2_r   = !!(src2->flags & IR3_REG_R);
-       }
-
-       cat2->dst      = reg(dst, info, instr->repeat,
-                       IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
-       cat2->repeat   = instr->repeat;
-       cat2->sat      = !!(instr->flags & IR3_INSTR_SAT);
-       cat2->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat2->ul       = !!(instr->flags & IR3_INSTR_UL);
-       cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
-       cat2->ei       = !!(dst->flags & IR3_REG_EI);
-       cat2->cond     = instr->cat2.condition;
-       cat2->full     = ! (src1->flags & IR3_REG_HALF);
-       cat2->opc      = instr->opc;
-       cat2->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat2->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat2->opc_cat  = 2;
-
-       return 0;
-}
-
-static int emit_cat3(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src1 = instr->regs[1];
-       struct ir3_register *src2 = instr->regs[2];
-       struct ir3_register *src3 = instr->regs[3];
-       unsigned absneg = ir3_cat3_absneg(instr->opc);
-       instr_cat3_t *cat3 = ptr;
-       uint32_t src_flags = 0;
-
-       switch (instr->opc) {
-       case OPC_MAD_F16:
-       case OPC_MAD_U16:
-       case OPC_MAD_S16:
-       case OPC_SEL_B16:
-       case OPC_SEL_S16:
-       case OPC_SEL_F16:
-       case OPC_SAD_S16:
-       case OPC_SAD_S32:  // really??
-               src_flags |= IR3_REG_HALF;
-               break;
-       default:
-               break;
-       }
-
-       iassert(instr->regs_count == 4);
-       iassert(!((src1->flags ^ src_flags) & IR3_REG_HALF));
-       iassert(!((src2->flags ^ src_flags) & IR3_REG_HALF));
-       iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
-
-       if (src1->flags & IR3_REG_RELATIV) {
-               iassert(src1->array.offset < (1 << 10));
-               cat3->rel1.src1      = reg(src1, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
-                               IR3_REG_HALF | absneg);
-               cat3->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
-               cat3->rel1.src1_rel  = 1;
-       } else if (src1->flags & IR3_REG_CONST) {
-               iassert(src1->num < (1 << 12));
-               cat3->c1.src1   = reg(src1, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
-               cat3->c1.src1_c = 1;
-       } else {
-               iassert(src1->num < (1 << 11));
-               cat3->src1 = reg(src1, info, instr->repeat,
-                               IR3_REG_R | IR3_REG_HALF | absneg);
-       }
-
-       cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-       cat3->src1_r   = !!(src1->flags & IR3_REG_R);
-
-       cat3->src2     = reg(src2, info, instr->repeat,
-                       IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg);
-       cat3->src2_c   = !!(src2->flags & IR3_REG_CONST);
-       cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-       cat3->src2_r   = !!(src2->flags & IR3_REG_R);
-
-       if (src3->flags & IR3_REG_RELATIV) {
-               iassert(src3->array.offset < (1 << 10));
-               cat3->rel2.src3      = reg(src3, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
-                               IR3_REG_HALF | absneg);
-               cat3->rel2.src3_c    = !!(src3->flags & IR3_REG_CONST);
-               cat3->rel2.src3_rel  = 1;
-       } else if (src3->flags & IR3_REG_CONST) {
-               iassert(src3->num < (1 << 12));
-               cat3->c2.src3   = reg(src3, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
-               cat3->c2.src3_c = 1;
-       } else {
-               iassert(src3->num < (1 << 11));
-               cat3->src3 = reg(src3, info, instr->repeat,
-                               IR3_REG_R | IR3_REG_HALF | absneg);
-       }
-
-       cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
-       cat3->src3_r   = !!(src3->flags & IR3_REG_R);
-
-       cat3->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-       cat3->repeat   = instr->repeat;
-       cat3->sat      = !!(instr->flags & IR3_INSTR_SAT);
-       cat3->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat3->ul       = !!(instr->flags & IR3_INSTR_UL);
-       cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
-       cat3->opc      = instr->opc;
-       cat3->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat3->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat3->opc_cat  = 3;
-
-       return 0;
-}
-
-static int emit_cat4(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src = instr->regs[1];
-       instr_cat4_t *cat4 = ptr;
-
-       iassert(instr->regs_count == 2);
-
-       if (src->flags & IR3_REG_RELATIV) {
-               iassert(src->array.offset < (1 << 10));
-               cat4->rel.src      = reg(src, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
-                               IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
-               cat4->rel.src_c    = !!(src->flags & IR3_REG_CONST);
-               cat4->rel.src_rel  = 1;
-       } else if (src->flags & IR3_REG_CONST) {
-               iassert(src->num < (1 << 12));
-               cat4->c.src   = reg(src, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS |
-                               IR3_REG_R | IR3_REG_HALF);
-               cat4->c.src_c = 1;
-       } else {
-               iassert(src->num < (1 << 11));
-               cat4->src = reg(src, info, instr->repeat,
-                               IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
-                               IR3_REG_R | IR3_REG_HALF);
-       }
-
-       cat4->src_im   = !!(src->flags & IR3_REG_IMMED);
-       cat4->src_neg  = !!(src->flags & IR3_REG_FNEG);
-       cat4->src_abs  = !!(src->flags & IR3_REG_FABS);
-       cat4->src_r    = !!(src->flags & IR3_REG_R);
-
-       cat4->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-       cat4->repeat   = instr->repeat;
-       cat4->sat      = !!(instr->flags & IR3_INSTR_SAT);
-       cat4->ss       = !!(instr->flags & IR3_INSTR_SS);
-       cat4->ul       = !!(instr->flags & IR3_INSTR_UL);
-       cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF);
-       cat4->full     = ! (src->flags & IR3_REG_HALF);
-       cat4->opc      = instr->opc;
-       cat4->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat4->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat4->opc_cat  = 4;
-
-       return 0;
-}
-
-static int emit_cat5(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src1 = instr->regs[1];
-       struct ir3_register *src2 = instr->regs[2];
-       struct ir3_register *src3 = instr->regs[3];
-       instr_cat5_t *cat5 = ptr;
-
-       iassert_type(dst, type_size(instr->cat5.type) == 32);
-
-       assume(src1 || !src2);
-       assume(src2 || !src3);
-
-       if (src1) {
-               cat5->full = ! (src1->flags & IR3_REG_HALF);
-               cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
-       }
-
-       if (instr->flags & IR3_INSTR_S2EN) {
-               if (src2) {
-                       iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
-                       cat5->s2en.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
-               }
-               if (src3) {
-                       iassert(src3->flags & IR3_REG_HALF);
-                       cat5->s2en.src3 = reg(src3, info, instr->repeat, IR3_REG_HALF);
-               }
-               iassert(!(instr->cat5.samp | instr->cat5.tex));
-       } else {
-               iassert(!src3);
-               if (src2) {
-                       iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
-                       cat5->norm.src2 = reg(src2, info, instr->repeat, IR3_REG_HALF);
-               }
-               cat5->norm.samp = instr->cat5.samp;
-               cat5->norm.tex  = instr->cat5.tex;
-       }
-
-       cat5->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-       cat5->wrmask   = dst->wrmask;
-       cat5->type     = instr->cat5.type;
-       cat5->is_3d    = !!(instr->flags & IR3_INSTR_3D);
-       cat5->is_a     = !!(instr->flags & IR3_INSTR_A);
-       cat5->is_s     = !!(instr->flags & IR3_INSTR_S);
-       cat5->is_s2en  = !!(instr->flags & IR3_INSTR_S2EN);
-       cat5->is_o     = !!(instr->flags & IR3_INSTR_O);
-       cat5->is_p     = !!(instr->flags & IR3_INSTR_P);
-       cat5->opc      = instr->opc;
-       cat5->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat5->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat5->opc_cat  = 5;
-
-       return 0;
-}
-
-static int emit_cat6(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       struct ir3_register *dst, *src1, *src2;
-       instr_cat6_t *cat6 = ptr;
-       bool type_full = type_size(instr->cat6.type) == 32;
-
-       cat6->type     = instr->cat6.type;
-       cat6->opc      = instr->opc;
-       cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat6->g        = !!(instr->flags & IR3_INSTR_G);
-       cat6->opc_cat  = 6;
-
-       switch (instr->opc) {
-       case OPC_RESINFO:
-       case OPC_RESFMT:
-               iassert_type(instr->regs[0], type_full); /* dst */
-               iassert_type(instr->regs[1], type_full); /* src1 */
-               break;
-       case OPC_L2G:
-       case OPC_G2L:
-               iassert_type(instr->regs[0], true);      /* dst */
-               iassert_type(instr->regs[1], true);      /* src1 */
-               break;
-       case OPC_STG:
-       case OPC_STL:
-       case OPC_STP:
-       case OPC_STI:
-       case OPC_STLW:
-       case OPC_STIB:
-               /* no dst, so regs[0] is dummy */
-               iassert_type(instr->regs[1], true);      /* dst */
-               iassert_type(instr->regs[2], type_full); /* src1 */
-               iassert_type(instr->regs[3], true);      /* src2 */
-               break;
-       default:
-               iassert_type(instr->regs[0], type_full); /* dst */
-               iassert_type(instr->regs[1], true);      /* src1 */
-               if (instr->regs_count > 2)
-                       iassert_type(instr->regs[2], true);  /* src2 */
-               break;
-       }
-
-       /* the "dst" for a store instruction is (from the perspective
-        * of data flow in the shader, ie. register use/def, etc) in
-        * fact a register that is read by the instruction, rather
-        * than written:
-        */
-       if (is_store(instr)) {
-               iassert(instr->regs_count >= 3);
-
-               dst  = instr->regs[1];
-               src1 = instr->regs[2];
-               src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL;
-       } else {
-               iassert(instr->regs_count >= 2);
-
-               dst  = instr->regs[0];
-               src1 = instr->regs[1];
-               src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
-       }
-
-       /* TODO we need a more comprehensive list about which instructions
-        * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
-        * indicate to use the src_off encoding even if offset is zero
-        * (but then what to do about dst_off?)
-        */
-       if (is_atomic(instr->opc)) {
-               instr_cat6ldgb_t *ldgb = ptr;
-
-               /* maybe these two bits both determine the instruction encoding? */
-               cat6->src_off = false;
-
-               ldgb->d = instr->cat6.d - 1;
-               ldgb->typed = instr->cat6.typed;
-               ldgb->type_size = instr->cat6.iim_val - 1;
-
-               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-
-               if (ldgb->g) {
-                       struct ir3_register *src3 = instr->regs[3];
-                       struct ir3_register *src4 = instr->regs[4];
-
-                       /* first src is src_ssbo: */
-                       iassert(src1->flags & IR3_REG_IMMED);
-                       ldgb->src_ssbo = src1->uim_val;
-
-                       ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-                       ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
-                       ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
-                       ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
-
-                       ldgb->src3 = reg(src4, info, instr->repeat, 0);
-                       ldgb->pad0 = 0x1;
-                       ldgb->pad3 = 0x1;
-               } else {
-                       ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
-                       ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED);
-                       ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-                       ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
-                       ldgb->pad0 = 0x1;
-                       ldgb->pad3 = 0x0;
-               }
-
-               return 0;
-       } else if (instr->opc == OPC_LDGB) {
-               struct ir3_register *src3 = instr->regs[3];
-               instr_cat6ldgb_t *ldgb = ptr;
-
-               /* maybe these two bits both determine the instruction encoding? */
-               cat6->src_off = false;
-
-               ldgb->d = instr->cat6.d - 1;
-               ldgb->typed = instr->cat6.typed;
-               ldgb->type_size = instr->cat6.iim_val - 1;
-
-               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-
-               /* first src is src_ssbo: */
-               iassert(src1->flags & IR3_REG_IMMED);
-               ldgb->src_ssbo = src1->uim_val;
-
-               /* then next two are src1/src2: */
-               ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-               ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
-               ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
-               ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
-
-               ldgb->pad0 = 0x0;
-               ldgb->pad3 = 0x1;
-
-               return 0;
-       } else if (instr->opc == OPC_RESINFO) {
-               instr_cat6ldgb_t *ldgb = ptr;
-
-               ldgb->d = instr->cat6.d - 1;
-
-               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-
-               /* first src is src_ssbo: */
-               iassert(src1->flags & IR3_REG_IMMED);
-               ldgb->src_ssbo = src1->uim_val;
-
-               return 0;
-       } else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) {
-               struct ir3_register *src3 = instr->regs[4];
-               instr_cat6stgb_t *stgb = ptr;
-
-               /* maybe these two bits both determine the instruction encoding? */
-               cat6->src_off = true;
-               stgb->pad3 = 0x2;
-
-               stgb->d = instr->cat6.d - 1;
-               stgb->typed = instr->cat6.typed;
-               stgb->type_size = instr->cat6.iim_val - 1;
-
-               /* first src is dst_ssbo: */
-               iassert(dst->flags & IR3_REG_IMMED);
-               stgb->dst_ssbo = dst->uim_val;
-
-               /* then src1/src2/src3: */
-               stgb->src1 = reg(src1, info, instr->repeat, 0);
-               stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-               stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
-               stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
-               stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
-
-               return 0;
-       } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) ||
-                       (instr->opc == OPC_LDL)) {
-               instr_cat6a_t *cat6a = ptr;
-
-               cat6->src_off = true;
-
-               cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
-               cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED);
-               if (src2) {
-                       cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-                       cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED);
-               }
-               cat6a->off = instr->cat6.src_offset;
-       } else {
-               instr_cat6b_t *cat6b = ptr;
-
-               cat6->src_off = false;
-
-               cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED | IR3_REG_HALF);
-               cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED);
-               if (src2) {
-                       cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
-                       cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED);
-               }
-       }
-
-       if (instr->cat6.dst_offset || (instr->opc == OPC_STG) ||
-                       (instr->opc == OPC_STL)) {
-               instr_cat6c_t *cat6c = ptr;
-               cat6->dst_off = true;
-               cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-               cat6c->off = instr->cat6.dst_offset;
-       } else {
-               instr_cat6d_t *cat6d = ptr;
-               cat6->dst_off = false;
-               cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-       }
-
-       return 0;
-}
-
-static int emit_cat7(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info)
-{
-       instr_cat7_t *cat7 = ptr;
-
-       cat7->ss      = !!(instr->flags & IR3_INSTR_SS);
-       cat7->w       = instr->cat7.w;
-       cat7->r       = instr->cat7.r;
-       cat7->l       = instr->cat7.l;
-       cat7->g       = instr->cat7.g;
-       cat7->opc     = instr->opc;
-       cat7->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
-       cat7->sync    = !!(instr->flags & IR3_INSTR_SY);
-       cat7->opc_cat = 7;
-
-       return 0;
-}
-
-static int (*emit[])(struct ir3_instruction *instr, void *ptr,
-               struct ir3_info *info) = {
-       emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6,
-       emit_cat7,
-};
-
-void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
-               uint32_t gpu_id)
-{
-       uint32_t *ptr, *dwords;
-
-       info->gpu_id        = gpu_id;
-       info->max_reg       = -1;
-       info->max_half_reg  = -1;
-       info->max_const     = -1;
-       info->instrs_count  = 0;
-       info->sizedwords    = 0;
-       info->ss = info->sy = 0;
-
-       list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-                       info->sizedwords += 2;
-               }
-       }
-
-       /* need an integer number of instruction "groups" (sets of 16
-        * instructions on a4xx and later, or sets of 4 instructions on
-        * a3xx), so pad out w/ NOPs if needed (NOTE: each instruction
-        * is 64bits):
-        */
-       if (gpu_id >= 400) {
-               info->sizedwords = align(info->sizedwords, 16 * 2);
-       } else {
-               info->sizedwords = align(info->sizedwords, 4 * 2);
-       }
-
-       ptr = dwords = calloc(4, info->sizedwords);
-
-       list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-                       int ret = emit[opc_cat(instr->opc)](instr, dwords, info);
-                       if (ret)
-                               goto fail;
-                       info->instrs_count += 1 + instr->repeat;
-                       dwords += 2;
-
-                       if (instr->flags & IR3_INSTR_SS)
-                               info->ss++;
-
-                       if (instr->flags & IR3_INSTR_SY)
-                               info->sy++;
-               }
-       }
-
-       return ptr;
-
-fail:
-       free(ptr);
-       return NULL;
-}
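
A worked example of the padding: 10 instructions occupy 20 dwords, which
a4xx and later rounds up to 32 dwords (a group of 16 instructions) and
a3xx rounds up to 24 dwords. A toy version of that alignment math, with
align_up standing in for the util align() helper:

    #include <assert.h>

    /* round v up to a multiple of a (a power of two) */
    static unsigned align_up(unsigned v, unsigned a)
    {
            return (v + (a - 1)) & ~(a - 1);
    }

    int main(void)
    {
            unsigned sizedwords = 10 * 2;  /* 10 instrs, 2 dwords each */

            assert(align_up(sizedwords, 16 * 2) == 32); /* a4xx+ groups of 16 */
            assert(align_up(sizedwords, 4 * 2) == 24);  /* a3xx groups of 4 */
            return 0;
    }
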
-
-static struct ir3_register * reg_create(struct ir3 *shader,
-               int num, int flags)
-{
-       struct ir3_register *reg =
-                       ir3_alloc(shader, sizeof(struct ir3_register));
-       reg->wrmask = 1;
-       reg->flags = flags;
-       reg->num = num;
-       return reg;
-}
-
-static void insert_instr(struct ir3_block *block,
-               struct ir3_instruction *instr)
-{
-       struct ir3 *shader = block->shader;
-#ifdef DEBUG
-       instr->serialno = ++shader->instr_count;
-#endif
-       list_addtail(&instr->node, &block->instr_list);
-
-       if (is_input(instr))
-               array_insert(shader, shader->baryfs, instr);
-}
-
-struct ir3_block * ir3_block_create(struct ir3 *shader)
-{
-       struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
-#ifdef DEBUG
-       block->serialno = ++shader->block_count;
-#endif
-       block->shader = shader;
-       list_inithead(&block->node);
-       list_inithead(&block->instr_list);
-       return block;
-}
-
-static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg)
-{
-       struct ir3_instruction *instr;
-       unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0]));
-       char *ptr = ir3_alloc(block->shader, sz);
-
-       instr = (struct ir3_instruction *)ptr;
-       ptr  += sizeof(*instr);
-       instr->regs = (struct ir3_register **)ptr;
-
-#ifdef DEBUG
-       instr->regs_max = nreg;
-#endif
-
-       return instr;
-}
-
-struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
-               opc_t opc, int nreg)
-{
-       struct ir3_instruction *instr = instr_create(block, nreg);
-       instr->block = block;
-       instr->opc = opc;
-       insert_instr(block, instr);
-       return instr;
-}
-
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc)
-{
-       /* NOTE: we could be slightly more clever, at least for non-meta,
-        * and choose # of regs based on category.
-        */
-       return ir3_instr_create2(block, opc, 4);
-}
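
Pulling the construction helpers together: regs[0] is always the dst,
with sources following, as the emit_cat*() functions above assume. A
minimal sketch of building one ALU instruction; OPC_ADD_F and the exact
register numbering (gpr << 2 | component) are assumptions based on the
encoding seen in reg():

    #include "ir3.h"

    /* sketch: emit "add.f r0.x, r1.x, r1.y" into a fresh shader */
    static struct ir3 * build_add(struct ir3_compiler *compiler)
    {
            struct ir3 *ir = ir3_create(compiler, 0, 0);
            struct ir3_block *block = ir3_block_create(ir);

            list_addtail(&block->node, &ir->block_list);

            struct ir3_instruction *add = ir3_instr_create(block, OPC_ADD_F);
            ir3_reg_create(add, (0 << 2) | 0, 0);  /* dst:  r0.x */
            ir3_reg_create(add, (1 << 2) | 0, 0);  /* src1: r1.x */
            ir3_reg_create(add, (1 << 2) | 1, 0);  /* src2: r1.y */

            return ir;  /* caller eventually calls ir3_destroy(ir) */
    }
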
-
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
-{
-       struct ir3_instruction *new_instr = instr_create(instr->block,
-                       instr->regs_count);
-       struct ir3_register **regs;
-       unsigned i;
-
-       regs = new_instr->regs;
-       *new_instr = *instr;
-       new_instr->regs = regs;
-
-       insert_instr(instr->block, new_instr);
-
-       /* clone registers: */
-       new_instr->regs_count = 0;
-       for (i = 0; i < instr->regs_count; i++) {
-               struct ir3_register *reg = instr->regs[i];
-               struct ir3_register *new_reg =
-                               ir3_reg_create(new_instr, reg->num, reg->flags);
-               *new_reg = *reg;
-       }
-
-       return new_instr;
-}
-
-/* Add a false dependency to instruction, to ensure it is scheduled first: */
-void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
-{
-       array_insert(instr, instr->deps, dep);
-}
-
-struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
-               int num, int flags)
-{
-       struct ir3 *shader = instr->block->shader;
-       struct ir3_register *reg = reg_create(shader, num, flags);
-#ifdef DEBUG
-       debug_assert(instr->regs_count < instr->regs_max);
-#endif
-       instr->regs[instr->regs_count++] = reg;
-       return reg;
-}
-
-struct ir3_register * ir3_reg_clone(struct ir3 *shader,
-               struct ir3_register *reg)
-{
-       struct ir3_register *new_reg = reg_create(shader, 0, 0);
-       *new_reg = *reg;
-       return new_reg;
-}
-
-void
-ir3_instr_set_address(struct ir3_instruction *instr,
-               struct ir3_instruction *addr)
-{
-       if (instr->address != addr) {
-               struct ir3 *ir = instr->block->shader;
-               instr->address = addr;
-               array_insert(ir, ir->indirects, instr);
-       }
-}
-
-void
-ir3_block_clear_mark(struct ir3_block *block)
-{
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
-               instr->flags &= ~IR3_INSTR_MARK;
-}
-
-void
-ir3_clear_mark(struct ir3 *ir)
-{
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               ir3_block_clear_mark(block);
-       }
-}
-
-/* note: this will destroy instr->depth, don't do it until after sched! */
-unsigned
-ir3_count_instructions(struct ir3 *ir)
-{
-       unsigned cnt = 0;
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-                       instr->ip = cnt++;
-               }
-               block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
-               block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
-       }
-       return cnt;
-}
-
-struct ir3_array *
-ir3_lookup_array(struct ir3 *ir, unsigned id)
-{
-       list_for_each_entry (struct ir3_array, arr, &ir->array_list, node)
-               if (arr->id == id)
-                       return arr;
-       return NULL;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
deleted file mode 100644 (file)
index ea32188..0000000
+++ /dev/null
@@ -1,1394 +0,0 @@
-/*
- * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IR3_H_
-#define IR3_H_
-
-#include <stdint.h>
-#include <stdbool.h>
-
-#include "compiler/shader_enums.h"
-
-#include "util/u_debug.h"
-#include "util/list.h"
-
-#include "instr-a3xx.h"
-
-/* low level intermediate representation of an adreno shader program */
-
-struct ir3_compiler;
-struct ir3;
-struct ir3_instruction;
-struct ir3_block;
-
-struct ir3_info {
-       uint32_t gpu_id;
-       uint16_t sizedwords;
-       uint16_t instrs_count;   /* expanded to account for rpt's */
-       /* NOTE: max_reg, etc, does not include registers not touched
-        * by the shader (ie. vertex fetched via VFD_DECODE but not
-        * touched by shader)
-        */
-       int8_t   max_reg;   /* highest GPR # used by shader */
-       int8_t   max_half_reg;
-       int16_t  max_const;
-
-       /* number of sync bits: */
-       uint16_t ss, sy;
-};
-
-struct ir3_register {
-       enum {
-               IR3_REG_CONST  = 0x001,
-               IR3_REG_IMMED  = 0x002,
-               IR3_REG_HALF   = 0x004,
-               /* high registers are used for some things in compute shaders,
-                * for example.  They seem to hold values that are global/shared
-                * across all the threads in a wave.
-                */
-               IR3_REG_HIGH   = 0x008,
-               IR3_REG_RELATIV= 0x010,
-               IR3_REG_R      = 0x020,
-               /* Most instructions, it seems, can do float abs/neg but not
-                * integer.  The CP pass needs to know what is intended (int or
-                * float) in order to do the right thing.  For this reason the
-                * abs/neg flags are split out into float and int variants.  In
-                * addition, for .b (bitwise) operations, the negate is actually
-                * a bitwise not, so that is split out into its own flag to make
-                * it more clear.
-                */
-               IR3_REG_FNEG   = 0x040,
-               IR3_REG_FABS   = 0x080,
-               IR3_REG_SNEG   = 0x100,
-               IR3_REG_SABS   = 0x200,
-               IR3_REG_BNOT   = 0x400,
-               IR3_REG_EVEN   = 0x800,
-               IR3_REG_POS_INF= 0x1000,
-               /* (ei) flag, end-input?  Set on last bary, presumably to signal
-                * that the shader needs no more input:
-                */
-               IR3_REG_EI     = 0x2000,
-               /* meta-flags, for intermediate stages of IR, ie.
-                * before register assignment is done:
-                */
-               IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
-               IR3_REG_ARRAY  = 0x8000,
-
-       } flags;
-
-       /* normal registers:
-        * the component is in the low two bits of the reg #, so
-        * rN.x becomes: (N << 2) | x
-        */
-       int   num;
-       union {
-               /* immediate: */
-               int32_t  iim_val;
-               uint32_t uim_val;
-               float    fim_val;
-               /* relative: */
-               struct {
-                       uint16_t id;
-                       int16_t offset;
-               } array;
-       };
-
-       /* For IR3_REG_SSA, src registers contain ptr back to assigning
-        * instruction.
-        *
-        * For IR3_REG_ARRAY, the pointer is back to the last dependent
-        * array access (although the net effect is the same, it points
-        * back to a previous instruction that we depend on).
-        */
-       struct ir3_instruction *instr;
-
-       union {
-               /* used for cat5 instructions, but also for internal/IR level
-                * tracking of what registers are read/written by an instruction.
-                * wrmask may be a bad name since it is used to represent both
-                * src and dst that touch multiple adjacent registers.
-                */
-               unsigned wrmask;
-               /* for relative addressing, 32bits for array size is too small,
-                * but otoh we don't need to deal with disjoint sets, so instead
-                * use a simple size field (number of scalar components).
-                */
-               unsigned size;
-       };
-};
-
-/*
- * Stupid/simple growable array implementation:
- */
-#define DECLARE_ARRAY(type, name) \
-       unsigned name ## _count, name ## _sz; \
-       type * name;
-
-#define array_insert(ctx, arr, val) do { \
-               if (arr ## _count == arr ## _sz) { \
-                       arr ## _sz = MAX2(2 * arr ## _sz, 16); \
-                       arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
-               } \
-               arr[arr ##_count++] = val; \
-       } while (0)
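-
-/* usage sketch, using the 'keeps' array declared on ir3_block below
- * (the first arg is the ralloc ctx that owns the array):
- *
- *   DECLARE_ARRAY(struct ir3_instruction *, keeps);
- *   ...
- *   array_insert(block, block->keeps, instr);
- */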
-
-struct ir3_instruction {
-       struct ir3_block *block;
-       opc_t opc;
-       enum {
-               /* (sy) flag is set on first instruction, and after sample
-                * instructions (probably just on RAW hazard).
-                */
-               IR3_INSTR_SY    = 0x001,
-               /* (ss) flag is set on first instruction, and first instruction
-                * to depend on the result of "long" instructions (RAW hazard):
-                *
-                *   rcp, rsq, log2, exp2, sin, cos, sqrt
-                *
-                * It seems to synchronize until all in-flight instructions are
-                * completed, for example:
-                *
-                *   rsq hr1.w, hr1.w
-                *   add.f hr2.z, (neg)hr2.z, hc0.y
-                *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
-                *   rsq hr2.x, hr2.x
-                *   (rpt1)nop
-                *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
-                *   nop
-                *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
-                *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
-                *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
-                *
-                * The last mul.f does not have (ss) set, presumably because the
-                * (ss) on the previous instruction does the job.
-                *
-                * The blob driver also seems to set it on WAR hazards, although
-                * not really clear if this is needed or just blob compiler being
-                * sloppy.  So far I haven't found a case where removing the (ss)
-                * causes problems for WAR hazard, but I could just be getting
-                * lucky:
-                *
-                *   rcp r1.y, r3.y
-                *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
-                *
-                */
-               IR3_INSTR_SS    = 0x002,
-               /* (jp) flag is set on jump targets:
-                */
-               IR3_INSTR_JP    = 0x004,
-               IR3_INSTR_UL    = 0x008,
-               IR3_INSTR_3D    = 0x010,
-               IR3_INSTR_A     = 0x020,
-               IR3_INSTR_O     = 0x040,
-               IR3_INSTR_P     = 0x080,
-               IR3_INSTR_S     = 0x100,
-               IR3_INSTR_S2EN  = 0x200,
-               IR3_INSTR_G     = 0x400,
-               IR3_INSTR_SAT   = 0x800,
-               /* meta-flags, for intermediate stages of IR, ie.
-                * before register assignment is done:
-                */
-               IR3_INSTR_MARK  = 0x1000,
-               IR3_INSTR_UNUSED= 0x2000,
-       } flags;
-       int repeat;
-#ifdef DEBUG
-       unsigned regs_max;
-#endif
-       unsigned regs_count;
-       struct ir3_register **regs;
-       union {
-               struct {
-                       char inv;
-                       char comp;
-                       int  immed;
-                       struct ir3_block *target;
-               } cat0;
-               struct {
-                       type_t src_type, dst_type;
-               } cat1;
-               struct {
-                       enum {
-                               IR3_COND_LT = 0,
-                               IR3_COND_LE = 1,
-                               IR3_COND_GT = 2,
-                               IR3_COND_GE = 3,
-                               IR3_COND_EQ = 4,
-                               IR3_COND_NE = 5,
-                       } condition;
-               } cat2;
-               struct {
-                       unsigned samp, tex;
-                       type_t type;
-               } cat5;
-               struct {
-                       type_t type;
-                       int src_offset;
-                       int dst_offset;
-                       int iim_val : 3;      /* for ldgb/stgb, # of components */
-                       int d : 3;
-                       bool typed : 1;
-               } cat6;
-               struct {
-                       unsigned w : 1;       /* write */
-                       unsigned r : 1;       /* read */
-                       unsigned l : 1;       /* local */
-                       unsigned g : 1;       /* global */
-               } cat7;
-               /* for meta-instructions, just used to hold extra data
-                * before instruction scheduling, etc
-                */
-               struct {
-                       int off;              /* component/offset */
-               } fo;
-               struct {
-                       struct ir3_block *block;
-               } inout;
-       };
-
-       /* transient values used during various algorithms: */
-       union {
-               /* The instruction depth is the max dependency distance to output.
-                *
-                * You can also think of it as the "cost", if we did any sort of
-                * optimization for register footprint.  Ie. a value that is just
-                * the result of moving a const to a reg would have a low cost, so
-                * it could make sense to duplicate the instruction at various
-                * points where the result is needed to reduce register footprint.
-                */
-               unsigned depth;
-               /* When we get to the RA stage, we no longer need depth, but
-                * we do need instruction's position/name:
-                */
-               struct {
-                       uint16_t ip;
-                       uint16_t name;
-               };
-       };
-
-       /* used for per-pass extra instruction data.
-        */
-       void *data;
-
-       /* Used during CP and RA stages.  For fanin and shader inputs/
-        * outputs where we need a sequence of consecutive registers,
-        * keep track of each src instruction's left (ie. 'n-1') and right
-        * (ie. 'n+1') neighbor.  The front-end must insert enough mov's
-        * to ensure that each instruction has at most one left and at
-        * most one right neighbor.  During the copy-propagation pass,
-        * we only remove mov's when we can preserve this constraint.
-        * And during the RA stage, we use the neighbor information to
-        * allocate a block of registers in one shot.
-        *
-        * TODO: maybe just add something like:
-        *   struct ir3_instruction_ref {
-        *       struct ir3_instruction *instr;
-        *       unsigned cnt;
-        *   }
-        *
-        * Or can we get away without the refcnt stuff?  It seems like
-        * it should be overkill..  the problem is that, potentially after
-        * already eliminating some mov's, you could have a single mov that
-        * needs to be grouped with its neighbors in two different
-        * places (ex. shader output and a fanin).
-        */
-       struct {
-               struct ir3_instruction *left, *right;
-               uint16_t left_cnt, right_cnt;
-       } cp;
-
-       /* an instruction can reference at most one address register amongst
-        * its src/dst registers.  Beyond that, you need to insert mov's.
-        *
-        * NOTE: do not write this directly, use ir3_instr_set_address()
-        */
-       struct ir3_instruction *address;
-
-       /* Tracking for additional dependent instructions.  Used to handle
-        * barriers, WAR hazards for arrays/SSBOs/etc.
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, deps);
-
-       /*
-        * From PoV of instruction scheduling, not execution (ie. ignores global/
-        * local distinction):
-        *                            shared  image  atomic  SSBO  everything
-        *   barrier()/            -   R/W     R/W    R/W     R/W       X
-        *     groupMemoryBarrier()
-        *   memoryBarrier()       -           R/W    R/W
-        *     (but only images declared coherent?)
-        *   memoryBarrierAtomic() -                  R/W
-        *   memoryBarrierBuffer() -                          R/W
-        *   memoryBarrierImage()  -           R/W
-        *   memoryBarrierShared() -   R/W
-        *
-        * TODO I think for SSBO/image/shared, in cases where we can determine
-        * which variable is accessed, we don't need to care about accesses to
-        * different variables (unless declared coherent??)
-        */
-       enum {
-               IR3_BARRIER_EVERYTHING = 1 << 0,
-               IR3_BARRIER_SHARED_R   = 1 << 1,
-               IR3_BARRIER_SHARED_W   = 1 << 2,
-               IR3_BARRIER_IMAGE_R    = 1 << 3,
-               IR3_BARRIER_IMAGE_W    = 1 << 4,
-               IR3_BARRIER_BUFFER_R   = 1 << 5,
-               IR3_BARRIER_BUFFER_W   = 1 << 6,
-               IR3_BARRIER_ARRAY_R    = 1 << 7,
-               IR3_BARRIER_ARRAY_W    = 1 << 8,
-       } barrier_class, barrier_conflict;
-
-       /* Entry in ir3_block's instruction list: */
-       struct list_head node;
-
-       int use_count;      /* currently just updated/used by cp */
-
-#ifdef DEBUG
-       uint32_t serialno;
-#endif
-};
-
-static inline struct ir3_instruction *
-ir3_neighbor_first(struct ir3_instruction *instr)
-{
-       int cnt = 0;
-       while (instr->cp.left) {
-               instr = instr->cp.left;
-               if (++cnt > 0xffff) {
-                       debug_assert(0);
-                       break;
-               }
-       }
-       return instr;
-}
-
-static inline int ir3_neighbor_count(struct ir3_instruction *instr)
-{
-       int num = 1;
-
-       debug_assert(!instr->cp.left);
-
-       while (instr->cp.right) {
-               num++;
-               instr = instr->cp.right;
-               if (num > 0xffff) {
-                       debug_assert(0);
-                       break;
-               }
-       }
-
-       return num;
-}
-
-struct ir3 {
-       struct ir3_compiler *compiler;
-
-       unsigned ninputs, noutputs;
-       struct ir3_instruction **inputs;
-       struct ir3_instruction **outputs;
-
-       /* Track bary.f (and ldlv) instructions.. this is needed in
-        * scheduling to ensure that all varying fetches happen before
-        * any potential kill instructions.  The hw gets grumpy if all
-        * threads in a group are killed before the last bary.f gets
-        * a chance to signal end of input (ei).
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, baryfs);
-
-       /* Track all indirect instructions (read and write).  To avoid
-        * a deadlock scenario where an address register gets scheduled,
-        * but other dependent src instructions cannot be scheduled due
-        * to dependency on a *different* address register value, the
-        * scheduler needs to ensure that all of an instruction's other
-        * dependencies, besides the address register, are scheduled
-        * before the one that writes the address register.  Having a
-        * convenient list of instructions that reference some address
-        * register simplifies this.
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, indirects);
-
-       /* and same for instructions that consume predicate register: */
-       DECLARE_ARRAY(struct ir3_instruction *, predicates);
-
-       /* Track texture sample instructions which need texture state
-        * patched in (for astc-srgb workaround):
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
-
-       /* List of blocks: */
-       struct list_head block_list;
-
-       /* List of ir3_array's: */
-       struct list_head array_list;
-
-#ifdef DEBUG
-       unsigned block_count, instr_count;
-#endif
-};
-
-struct ir3_array {
-       struct list_head node;
-       unsigned length;
-       unsigned id;
-
-       struct nir_register *r;
-
-       /* To avoid array writes from getting DCE'd, keep track of the
-        * most recent write.  Any array access depends on the most
-        * recent write.  This way, nothing depends on writes after the
-        * last read.  But all the writes that happen before that have
-        * something depending on them.
-        */
-       struct ir3_instruction *last_write;
-
-       /* extra stuff used in RA pass: */
-       unsigned base;      /* base vreg name */
-       unsigned reg;       /* base physical reg */
-       uint16_t start_ip, end_ip;
-};
-
-struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
-
-struct ir3_block {
-       struct list_head node;
-       struct ir3 *shader;
-
-       const struct nir_block *nblock;
-
-       struct list_head instr_list;  /* list of ir3_instruction */
-
-       /* each block has either one or two successors.. in case of
-        * two successors, 'condition' decides which one to follow.
-        * A block preceding an if/else has two successors.
-        */
-       struct ir3_instruction *condition;
-       struct ir3_block *successors[2];
-
-       unsigned predecessors_count;
-       struct ir3_block **predecessors;
-
-       uint16_t start_ip, end_ip;
-
-       /* Track instructions which do not write a register but other-
-        * wise must not be discarded (such as kill, stg, etc)
-        */
-       DECLARE_ARRAY(struct ir3_instruction *, keeps);
-
-       /* used for per-pass extra block data.  Mainly used right
-        * now in RA step to track livein/liveout.
-        */
-       void *data;
-
-#ifdef DEBUG
-       uint32_t serialno;
-#endif
-};
-
-static inline uint32_t
-block_id(struct ir3_block *block)
-{
-#ifdef DEBUG
-       return block->serialno;
-#else
-       return (uint32_t)(unsigned long)block;
-#endif
-}
-
-struct ir3 * ir3_create(struct ir3_compiler *compiler,
-               unsigned nin, unsigned nout);
-void ir3_destroy(struct ir3 *shader);
-void * ir3_assemble(struct ir3 *shader,
-               struct ir3_info *info, uint32_t gpu_id);
-void * ir3_alloc(struct ir3 *shader, int sz);
-
-struct ir3_block * ir3_block_create(struct ir3 *shader);
-
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
-struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
-               opc_t opc, int nreg);
-struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
-void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
-const char *ir3_instr_name(struct ir3_instruction *instr);
-
-struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
-               int num, int flags);
-struct ir3_register * ir3_reg_clone(struct ir3 *shader,
-               struct ir3_register *reg);
-
-void ir3_instr_set_address(struct ir3_instruction *instr,
-               struct ir3_instruction *addr);
-
-static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
-{
-       if (instr->flags & IR3_INSTR_MARK)
-               return true;  /* already visited */
-       instr->flags |= IR3_INSTR_MARK;
-       return false;
-}
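-
-/* sketch of a typical recursive visitor built on the mark bits (call
- * ir3_clear_mark() once before starting the walk):
- *
- *   static void visit(struct ir3_instruction *instr)
- *   {
- *           if (ir3_instr_check_mark(instr))
- *                   return;   // already visited
- *           ...
- *   }
- */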
-
-void ir3_block_clear_mark(struct ir3_block *block);
-void ir3_clear_mark(struct ir3 *shader);
-
-unsigned ir3_count_instructions(struct ir3 *ir);
-
-static inline int ir3_instr_regno(struct ir3_instruction *instr,
-               struct ir3_register *reg)
-{
-       unsigned i;
-       for (i = 0; i < instr->regs_count; i++)
-               if (reg == instr->regs[i])
-                       return i;
-       return -1;
-}
-
-
-#define MAX_ARRAYS 16
-
-/* comp:
- *   0 - x
- *   1 - y
- *   2 - z
- *   3 - w
- */
-static inline uint32_t regid(int num, int comp)
-{
-       return (num << 2) | (comp & 0x3);
-}
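-
-/* e.g. regid(2, 1) == (2 << 2) | 1 == 9, ie. r2.y */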
-
-static inline uint32_t reg_num(struct ir3_register *reg)
-{
-       return reg->num >> 2;
-}
-
-static inline uint32_t reg_comp(struct ir3_register *reg)
-{
-       return reg->num & 0x3;
-}
-
-static inline bool is_flow(struct ir3_instruction *instr)
-{
-       return (opc_cat(instr->opc) == 0);
-}
-
-static inline bool is_kill(struct ir3_instruction *instr)
-{
-       return instr->opc == OPC_KILL;
-}
-
-static inline bool is_nop(struct ir3_instruction *instr)
-{
-       return instr->opc == OPC_NOP;
-}
-
-/* Is it a non-transformative (ie. not type changing) mov?  This can
- * also include absneg.s/absneg.f, which for the most part can be
- * treated as a mov (single src argument).
- */
-static inline bool is_same_type_mov(struct ir3_instruction *instr)
-{
-       struct ir3_register *dst;
-
-       switch (instr->opc) {
-       case OPC_MOV:
-               if (instr->cat1.src_type != instr->cat1.dst_type)
-                       return false;
-               break;
-       case OPC_ABSNEG_F:
-       case OPC_ABSNEG_S:
-               if (instr->flags & IR3_INSTR_SAT)
-                       return false;
-               break;
-       default:
-               return false;
-       }
-
-       dst = instr->regs[0];
-
-       /* mov's that write to a0.x or p0.x are special: */
-       if (dst->num == regid(REG_P0, 0))
-               return false;
-       if (dst->num == regid(REG_A0, 0))
-               return false;
-
-       if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
-               return false;
-
-       return true;
-}
-
-static inline bool is_alu(struct ir3_instruction *instr)
-{
-       return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
-}
-
-static inline bool is_sfu(struct ir3_instruction *instr)
-{
-       return (opc_cat(instr->opc) == 4);
-}
-
-static inline bool is_tex(struct ir3_instruction *instr)
-{
-       return (opc_cat(instr->opc) == 5);
-}
-
-static inline bool is_mem(struct ir3_instruction *instr)
-{
-       return (opc_cat(instr->opc) == 6);
-}
-
-static inline bool is_barrier(struct ir3_instruction *instr)
-{
-       return (opc_cat(instr->opc) == 7);
-}
-
-static inline bool
-is_store(struct ir3_instruction *instr)
-{
-       /* for these instructions, the "destination" register is
-        * actually a source, the address to store to.
-        */
-       switch (instr->opc) {
-       case OPC_STG:
-       case OPC_STGB:
-       case OPC_STIB:
-       case OPC_STP:
-       case OPC_STL:
-       case OPC_STLW:
-       case OPC_L2G:
-       case OPC_G2L:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_load(struct ir3_instruction *instr)
-{
-       switch (instr->opc) {
-       case OPC_LDG:
-       case OPC_LDGB:
-       case OPC_LDL:
-       case OPC_LDP:
-       case OPC_L2G:
-       case OPC_LDLW:
-       case OPC_LDC:
-       case OPC_LDLV:
-               /* probably some others too.. */
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_input(struct ir3_instruction *instr)
-{
-       /* in some cases, ldlv is used to fetch a varying without
-        * interpolation.. fortunately inloc is the first src
-        * register in either case
-        */
-       switch (instr->opc) {
-       case OPC_LDLV:
-       case OPC_BARY_F:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_bool(struct ir3_instruction *instr)
-{
-       switch (instr->opc) {
-       case OPC_CMPS_F:
-       case OPC_CMPS_S:
-       case OPC_CMPS_U:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool is_meta(struct ir3_instruction *instr)
-{
-       /* TODO how should we count PHI (and maybe fan-in/out) which
-        * might actually contribute some instructions to the final
-        * result?
-        */
-       return (opc_cat(instr->opc) == -1);
-}
-
-static inline bool writes_addr(struct ir3_instruction *instr)
-{
-       if (instr->regs_count > 0) {
-               struct ir3_register *dst = instr->regs[0];
-               return reg_num(dst) == REG_A0;
-       }
-       return false;
-}
-
-static inline bool writes_pred(struct ir3_instruction *instr)
-{
-       if (instr->regs_count > 0) {
-               struct ir3_register *dst = instr->regs[0];
-               return reg_num(dst) == REG_P0;
-       }
-       return false;
-}
-
-/* returns defining instruction for reg */
-/* TODO better name */
-static inline struct ir3_instruction *ssa(struct ir3_register *reg)
-{
-       if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
-               return reg->instr;
-       }
-       return NULL;
-}
-
-static inline bool conflicts(struct ir3_instruction *a,
-               struct ir3_instruction *b)
-{
-       return (a && b) && (a != b);
-}
-
-static inline bool reg_gpr(struct ir3_register *r)
-{
-       if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-               return false;
-       if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
-               return false;
-       return true;
-}
-
-static inline type_t half_type(type_t type)
-{
-       switch (type) {
-       case TYPE_F32: return TYPE_F16;
-       case TYPE_U32: return TYPE_U16;
-       case TYPE_S32: return TYPE_S16;
-       case TYPE_F16:
-       case TYPE_U16:
-       case TYPE_S16:
-               return type;
-       default:
-               assert(0);
-               return ~0;
-       }
-}
-
-/* some cat2 instructions (ie. those which are not float) can embed an
- * immediate:
- */
-static inline bool ir3_cat2_int(opc_t opc)
-{
-       switch (opc) {
-       case OPC_ADD_U:
-       case OPC_ADD_S:
-       case OPC_SUB_U:
-       case OPC_SUB_S:
-       case OPC_CMPS_U:
-       case OPC_CMPS_S:
-       case OPC_MIN_U:
-       case OPC_MIN_S:
-       case OPC_MAX_U:
-       case OPC_MAX_S:
-       case OPC_CMPV_U:
-       case OPC_CMPV_S:
-       case OPC_MUL_U:
-       case OPC_MUL_S:
-       case OPC_MULL_U:
-       case OPC_CLZ_S:
-       case OPC_ABSNEG_S:
-       case OPC_AND_B:
-       case OPC_OR_B:
-       case OPC_NOT_B:
-       case OPC_XOR_B:
-       case OPC_BFREV_B:
-       case OPC_CLZ_B:
-       case OPC_SHL_B:
-       case OPC_SHR_B:
-       case OPC_ASHR_B:
-       case OPC_MGEN_B:
-       case OPC_GETBIT_B:
-       case OPC_CBITS_B:
-       case OPC_BARY_F:
-               return true;
-
-       default:
-               return false;
-       }
-}
-
-
-/* map cat2 instruction to valid abs/neg flags: */
-static inline unsigned ir3_cat2_absneg(opc_t opc)
-{
-       switch (opc) {
-       case OPC_ADD_F:
-       case OPC_MIN_F:
-       case OPC_MAX_F:
-       case OPC_MUL_F:
-       case OPC_SIGN_F:
-       case OPC_CMPS_F:
-       case OPC_ABSNEG_F:
-       case OPC_CMPV_F:
-       case OPC_FLOOR_F:
-       case OPC_CEIL_F:
-       case OPC_RNDNE_F:
-       case OPC_RNDAZ_F:
-       case OPC_TRUNC_F:
-       case OPC_BARY_F:
-               return IR3_REG_FABS | IR3_REG_FNEG;
-
-       case OPC_ADD_U:
-       case OPC_ADD_S:
-       case OPC_SUB_U:
-       case OPC_SUB_S:
-       case OPC_CMPS_U:
-       case OPC_CMPS_S:
-       case OPC_MIN_U:
-       case OPC_MIN_S:
-       case OPC_MAX_U:
-       case OPC_MAX_S:
-       case OPC_CMPV_U:
-       case OPC_CMPV_S:
-       case OPC_MUL_U:
-       case OPC_MUL_S:
-       case OPC_MULL_U:
-       case OPC_CLZ_S:
-               return 0;
-
-       case OPC_ABSNEG_S:
-               return IR3_REG_SABS | IR3_REG_SNEG;
-
-       case OPC_AND_B:
-       case OPC_OR_B:
-       case OPC_NOT_B:
-       case OPC_XOR_B:
-       case OPC_BFREV_B:
-       case OPC_CLZ_B:
-       case OPC_SHL_B:
-       case OPC_SHR_B:
-       case OPC_ASHR_B:
-       case OPC_MGEN_B:
-       case OPC_GETBIT_B:
-       case OPC_CBITS_B:
-               return IR3_REG_BNOT;
-
-       default:
-               return 0;
-       }
-}
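-
-/* sketch: a pass considering folding a negate into a src can check, e.g.:
- *
- *   if (ir3_cat2_absneg(instr->opc) & IR3_REG_FNEG)
- *           ...  float negate is legal on this opcode  ...
- */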
-
-/* map cat3 instructions to valid abs/neg flags: */
-static inline unsigned ir3_cat3_absneg(opc_t opc)
-{
-       switch (opc) {
-       case OPC_MAD_F16:
-       case OPC_MAD_F32:
-       case OPC_SEL_F16:
-       case OPC_SEL_F32:
-               return IR3_REG_FNEG;
-
-       case OPC_MAD_U16:
-       case OPC_MADSH_U16:
-       case OPC_MAD_S16:
-       case OPC_MADSH_M16:
-       case OPC_MAD_U24:
-       case OPC_MAD_S24:
-       case OPC_SEL_S16:
-       case OPC_SEL_S32:
-       case OPC_SAD_S16:
-       case OPC_SAD_S32:
-               /* neg *may* work on 3rd src.. */
-
-       case OPC_SEL_B16:
-       case OPC_SEL_B32:
-
-       default:
-               return 0;
-       }
-}
-
-#define MASK(n) ((1 << (n)) - 1)
-
-/* iterator for an instruction's sources (reg), also returns src #: */
-#define foreach_src_n(__srcreg, __n, __instr) \
-       if ((__instr)->regs_count) \
-               for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
-                       if ((__srcreg = (__instr)->regs[__n + 1]))
-
-/* iterator for an instruction's sources (reg): */
-#define foreach_src(__srcreg, __instr) \
-       foreach_src_n(__srcreg, __i, __instr)
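-
-/* usage sketch, counting an instruction's immediate sources:
- *
- *   struct ir3_register *src;
- *   unsigned nimm = 0;
- *   foreach_src(src, instr)
- *           if (src->flags & IR3_REG_IMMED)
- *                   nimm++;
- */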
-
-static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
-{
-       unsigned cnt = instr->regs_count + instr->deps_count;
-       if (instr->address)
-               cnt++;
-       return cnt;
-}
-
-static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
-{
-       if (n == (instr->regs_count + instr->deps_count))
-               return instr->address;
-       if (n >= instr->regs_count)
-               return instr->deps[n - instr->regs_count];
-       return ssa(instr->regs[n]);
-}
-
-static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
-{
-       if (n == (instr->regs_count + instr->deps_count))
-               return false;
-       if (n >= instr->regs_count)
-               return true;
-       return false;
-}
-
-#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)
-
-/* iterator for an instruction's SSA sources (instr), also returns src #: */
-#define foreach_ssa_src_n(__srcinst, __n, __instr) \
-       for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
-               if ((__srcinst = __ssa_src_n(__instr, __n)))
-
-/* iterator for an instruction's SSA sources (instr): */
-#define foreach_ssa_src(__srcinst, __instr) \
-       foreach_ssa_src_n(__srcinst, __i, __instr)
-
-
-/* dump: */
-void ir3_print(struct ir3 *ir);
-void ir3_print_instr(struct ir3_instruction *instr);
-
-/* depth calculation: */
-int ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n);
-void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
-void ir3_depth(struct ir3 *ir);
-
-/* copy-propagate: */
-struct ir3_shader_variant;
-void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
-
-/* group neighbors and insert mov's to resolve conflicts: */
-void ir3_group(struct ir3 *ir);
-
-/* scheduling: */
-void ir3_sched_add_deps(struct ir3 *ir);
-int ir3_sched(struct ir3 *ir);
-
-/* register assignment: */
-struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
-int ir3_ra(struct ir3 *ir3, gl_shader_stage type,
-               bool frag_coord, bool frag_face);
-
-/* legalize: */
-void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary);
-
-/* ************************************************************************* */
-/* instruction helpers */
-
-/* creates SSA src of correct type (ie. half vs full precision) */
-static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
-               struct ir3_instruction *src, unsigned flags)
-{
-       struct ir3_register *reg;
-       if (src->regs[0]->flags & IR3_REG_HALF)
-               flags |= IR3_REG_HALF;
-       reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags);
-       reg->instr = src;
-       return reg;
-}
-
-static inline struct ir3_instruction *
-ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
-{
-       struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
-       ir3_reg_create(instr, 0, 0);   /* dst */
-       if (src->regs[0]->flags & IR3_REG_ARRAY) {
-               struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
-               src_reg->array = src->regs[0]->array;
-       } else {
-               __ssa_src(instr, src, 0);
-       }
-       debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
-       instr->cat1.src_type = type;
-       instr->cat1.dst_type = type;
-       return instr;
-}
-
-static inline struct ir3_instruction *
-ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
-               type_t src_type, type_t dst_type)
-{
-       struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
-       unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
-       unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
-
-       debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags);
-
-       ir3_reg_create(instr, 0, dst_flags);   /* dst */
-       __ssa_src(instr, src, 0);
-       instr->cat1.src_type = src_type;
-       instr->cat1.dst_type = dst_type;
-       debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
-       return instr;
-}
-
-static inline struct ir3_instruction *
-ir3_NOP(struct ir3_block *block)
-{
-       return ir3_instr_create(block, OPC_NOP);
-}
-
-#define INSTR0(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block)                                      \
-{                                                                        \
-       struct ir3_instruction *instr =                                      \
-               ir3_instr_create(block, OPC_##name);                             \
-       return instr;                                                        \
-}
-
-#define INSTR1(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-               struct ir3_instruction *a, unsigned aflags)                      \
-{                                                                        \
-       struct ir3_instruction *instr =                                      \
-               ir3_instr_create(block, OPC_##name);                             \
-       ir3_reg_create(instr, 0, 0);   /* dst */                             \
-       __ssa_src(instr, a, aflags);                                         \
-       return instr;                                                        \
-}
-
-#define INSTR2(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-               struct ir3_instruction *a, unsigned aflags,                      \
-               struct ir3_instruction *b, unsigned bflags)                      \
-{                                                                        \
-       struct ir3_instruction *instr =                                      \
-               ir3_instr_create(block, OPC_##name);                             \
-       ir3_reg_create(instr, 0, 0);   /* dst */                             \
-       __ssa_src(instr, a, aflags);                                         \
-       __ssa_src(instr, b, bflags);                                         \
-       return instr;                                                        \
-}
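-
-/* so INSTR2(ADD_F) below defines ir3_ADD_F(), used like (sketch):
- *
- *   struct ir3_instruction *sum =
- *           ir3_ADD_F(block, a, 0, b, IR3_REG_FNEG);   // a + (-b)
- */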
-
-#define INSTR3(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-               struct ir3_instruction *a, unsigned aflags,                      \
-               struct ir3_instruction *b, unsigned bflags,                      \
-               struct ir3_instruction *c, unsigned cflags)                      \
-{                                                                        \
-       struct ir3_instruction *instr =                                      \
-               ir3_instr_create(block, OPC_##name);                             \
-       ir3_reg_create(instr, 0, 0);   /* dst */                             \
-       __ssa_src(instr, a, aflags);                                         \
-       __ssa_src(instr, b, bflags);                                         \
-       __ssa_src(instr, c, cflags);                                         \
-       return instr;                                                        \
-}
-
-#define INSTR4(name)                                                     \
-static inline struct ir3_instruction *                                   \
-ir3_##name(struct ir3_block *block,                                      \
-               struct ir3_instruction *a, unsigned aflags,                      \
-               struct ir3_instruction *b, unsigned bflags,                      \
-               struct ir3_instruction *c, unsigned cflags,                      \
-               struct ir3_instruction *d, unsigned dflags)                      \
-{                                                                        \
-       struct ir3_instruction *instr =                                      \
-               ir3_instr_create2(block, OPC_##name, 5);                         \
-       ir3_reg_create(instr, 0, 0);   /* dst */                             \
-       __ssa_src(instr, a, aflags);                                         \
-       __ssa_src(instr, b, bflags);                                         \
-       __ssa_src(instr, c, cflags);                                         \
-       __ssa_src(instr, d, dflags);                                         \
-       return instr;                                                        \
-}
-
-#define INSTR4F(f, name)                                                 \
-static inline struct ir3_instruction *                                   \
-ir3_##name##_##f(struct ir3_block *block,                                \
-               struct ir3_instruction *a, unsigned aflags,                      \
-               struct ir3_instruction *b, unsigned bflags,                      \
-               struct ir3_instruction *c, unsigned cflags,                      \
-               struct ir3_instruction *d, unsigned dflags)                      \
-{                                                                        \
-       struct ir3_instruction *instr =                                      \
-               ir3_instr_create2(block, OPC_##name, 5);                         \
-       ir3_reg_create(instr, 0, 0);   /* dst */                             \
-       __ssa_src(instr, a, aflags);                                         \
-       __ssa_src(instr, b, bflags);                                         \
-       __ssa_src(instr, c, cflags);                                         \
-       __ssa_src(instr, d, dflags);                                         \
-       instr->flags |= IR3_INSTR_##f;                                       \
-       return instr;                                                        \
-}
-
-/* cat0 instructions: */
-INSTR0(BR)
-INSTR0(JUMP)
-INSTR1(KILL)
-INSTR0(END)
-
-/* cat2 instructions, most 2 src but some 1 src: */
-INSTR2(ADD_F)
-INSTR2(MIN_F)
-INSTR2(MAX_F)
-INSTR2(MUL_F)
-INSTR1(SIGN_F)
-INSTR2(CMPS_F)
-INSTR1(ABSNEG_F)
-INSTR2(CMPV_F)
-INSTR1(FLOOR_F)
-INSTR1(CEIL_F)
-INSTR1(RNDNE_F)
-INSTR1(RNDAZ_F)
-INSTR1(TRUNC_F)
-INSTR2(ADD_U)
-INSTR2(ADD_S)
-INSTR2(SUB_U)
-INSTR2(SUB_S)
-INSTR2(CMPS_U)
-INSTR2(CMPS_S)
-INSTR2(MIN_U)
-INSTR2(MIN_S)
-INSTR2(MAX_U)
-INSTR2(MAX_S)
-INSTR1(ABSNEG_S)
-INSTR2(AND_B)
-INSTR2(OR_B)
-INSTR1(NOT_B)
-INSTR2(XOR_B)
-INSTR2(CMPV_U)
-INSTR2(CMPV_S)
-INSTR2(MUL_U)
-INSTR2(MUL_S)
-INSTR2(MULL_U)
-INSTR1(BFREV_B)
-INSTR1(CLZ_S)
-INSTR1(CLZ_B)
-INSTR2(SHL_B)
-INSTR2(SHR_B)
-INSTR2(ASHR_B)
-INSTR2(BARY_F)
-INSTR2(MGEN_B)
-INSTR2(GETBIT_B)
-INSTR1(SETRM)
-INSTR1(CBITS_B)
-INSTR2(SHB)
-INSTR2(MSAD)
-
-/* cat3 instructions: */
-INSTR3(MAD_U16)
-INSTR3(MADSH_U16)
-INSTR3(MAD_S16)
-INSTR3(MADSH_M16)
-INSTR3(MAD_U24)
-INSTR3(MAD_S24)
-INSTR3(MAD_F16)
-INSTR3(MAD_F32)
-INSTR3(SEL_B16)
-INSTR3(SEL_B32)
-INSTR3(SEL_S16)
-INSTR3(SEL_S32)
-INSTR3(SEL_F16)
-INSTR3(SEL_F32)
-INSTR3(SAD_S16)
-INSTR3(SAD_S32)
-
-/* cat4 instructions: */
-INSTR1(RCP)
-INSTR1(RSQ)
-INSTR1(LOG2)
-INSTR1(EXP2)
-INSTR1(SIN)
-INSTR1(COS)
-INSTR1(SQRT)
-
-/* cat5 instructions: */
-INSTR1(DSX)
-INSTR1(DSY)
-
-static inline struct ir3_instruction *
-ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
-               unsigned wrmask, unsigned flags, unsigned samp, unsigned tex,
-               struct ir3_instruction *src0, struct ir3_instruction *src1)
-{
-       struct ir3_instruction *sam;
-       struct ir3_register *reg;
-
-       sam = ir3_instr_create(block, opc);
-       sam->flags |= flags;
-       ir3_reg_create(sam, 0, 0)->wrmask = wrmask;
-       if (src0) {
-               reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
-               reg->wrmask = (1 << (src0->regs_count - 1)) - 1;
-               reg->instr = src0;
-       }
-       if (src1) {
-               reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
-               reg->instr = src1;
-               reg->wrmask = (1 << (src1->regs_count - 1)) - 1;
-       }
-       sam->cat5.samp = samp;
-       sam->cat5.tex  = tex;
-       sam->cat5.type  = type;
-
-       return sam;
-}
-
-/* cat6 instructions: */
-INSTR2(LDLV)
-INSTR2(LDG)
-INSTR2(LDL)
-INSTR3(STG)
-INSTR3(STL)
-INSTR3(LDGB)
-INSTR4(STGB)
-INSTR4(STIB)
-INSTR1(RESINFO)
-INSTR1(RESFMT)
-INSTR2(ATOMIC_ADD)
-INSTR2(ATOMIC_SUB)
-INSTR2(ATOMIC_XCHG)
-INSTR2(ATOMIC_INC)
-INSTR2(ATOMIC_DEC)
-INSTR2(ATOMIC_CMPXCHG)
-INSTR2(ATOMIC_MIN)
-INSTR2(ATOMIC_MAX)
-INSTR2(ATOMIC_AND)
-INSTR2(ATOMIC_OR)
-INSTR2(ATOMIC_XOR)
-INSTR4F(G, ATOMIC_ADD)
-INSTR4F(G, ATOMIC_SUB)
-INSTR4F(G, ATOMIC_XCHG)
-INSTR4F(G, ATOMIC_INC)
-INSTR4F(G, ATOMIC_DEC)
-INSTR4F(G, ATOMIC_CMPXCHG)
-INSTR4F(G, ATOMIC_MIN)
-INSTR4F(G, ATOMIC_MAX)
-INSTR4F(G, ATOMIC_AND)
-INSTR4F(G, ATOMIC_OR)
-INSTR4F(G, ATOMIC_XOR)
-
-/* cat7 instructions: */
-INSTR0(BAR)
-INSTR0(FENCE)
-
-/* ************************************************************************* */
-/* split this out or find some helper to use.. like main/bitset.h.. */
-
-#include <string.h>
-
-#define MAX_REG 256
-
-typedef uint8_t regmask_t[2 * MAX_REG / 8];
-
-static inline unsigned regmask_idx(struct ir3_register *reg)
-{
-       unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
-       debug_assert(num < MAX_REG);
-       if (reg->flags & IR3_REG_HALF)
-               num += MAX_REG;
-       return num;
-}
-
-static inline void regmask_init(regmask_t *regmask)
-{
-       memset(regmask, 0, sizeof(*regmask));
-}
-
-static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
-{
-       unsigned idx = regmask_idx(reg);
-       if (reg->flags & IR3_REG_RELATIV) {
-               unsigned i;
-               for (i = 0; i < reg->size; i++, idx++)
-                       (*regmask)[idx / 8] |= 1 << (idx % 8);
-       } else {
-               unsigned mask;
-               for (mask = reg->wrmask; mask; mask >>= 1, idx++)
-                       if (mask & 1)
-                               (*regmask)[idx / 8] |= 1 << (idx % 8);
-       }
-}
-
-static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
-{
-       unsigned i;
-       for (i = 0; i < ARRAY_SIZE(*dst); i++)
-               (*dst)[i] = (*a)[i] | (*b)[i];
-}
-
-/* set bits in a if not set in b, conceptually:
- *   a |= (reg & ~b)
- */
-static inline void regmask_set_if_not(regmask_t *a,
-               struct ir3_register *reg, regmask_t *b)
-{
-       unsigned idx = regmask_idx(reg);
-       if (reg->flags & IR3_REG_RELATIV) {
-               unsigned i;
-               for (i = 0; i < reg->size; i++, idx++)
-                       if (!((*b)[idx / 8] & (1 << (idx % 8))))
-                               (*a)[idx / 8] |= 1 << (idx % 8);
-       } else {
-               unsigned mask;
-               for (mask = reg->wrmask; mask; mask >>= 1, idx++)
-                       if (mask & 1)
-                               if (!((*b)[idx / 8] & (1 << (idx % 8))))
-                                       (*a)[idx / 8] |= 1 << (idx % 8);
-       }
-}
-
-static inline bool regmask_get(regmask_t *regmask,
-               struct ir3_register *reg)
-{
-       unsigned idx = regmask_idx(reg);
-       if (reg->flags & IR3_REG_RELATIV) {
-               unsigned i;
-               for (i = 0; i < reg->size; i++, idx++)
-                       if ((*regmask)[idx / 8] & (1 << (idx % 8)))
-                               return true;
-       } else {
-               unsigned mask;
-               for (mask = reg->wrmask; mask; mask >>= 1, idx++)
-                       if (mask & 1)
-                               if ((*regmask)[idx / 8] & (1 << (idx % 8)))
-                                       return true;
-       }
-       return false;
-}
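-
-/* sketch of the legalize-style pattern these helpers support: accumulate
- * registers written by not-yet-synchronized instructions, and test each
- * source reg against the mask:
- *
- *   regmask_t needs_ss;
- *   regmask_init(&needs_ss);
- *   ...
- *   foreach_src(reg, instr)
- *           if (regmask_get(&needs_ss, reg))
- *                   instr->flags |= IR3_INSTR_SS;
- */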
-
-/* ************************************************************************* */
-
-#endif /* IR3_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cache.h b/src/gallium/drivers/freedreno/ir3/ir3_cache.h
index 3d3a7f8050dde491900f9e7be505c98bbd302e03..73d555e92ceac50eadd44fcfccb4ea07fea3a2f0 100644 (file)
@@ -27,7 +27,7 @@
 #ifndef IR3_CACHE_H_
 #define IR3_CACHE_H_
 
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 /*
  * An in-memory cache for mapping shader state objects plus shader key to
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index d12cdd353ab64ebb7b77ca27fc32aefe99d8e31c..47fd5dfd01293d152f074903f4e6b4901434e78d 100644 (file)
 #include "tgsi/tgsi_text.h"
 #include "tgsi/tgsi_dump.h"
 
-#include "ir3_compiler.h"
-#include "ir3_gallium.h"
-#include "ir3_nir.h"
-#include "instr-a3xx.h"
-#include "ir3.h"
+#include "ir3/ir3_compiler.h"
+#include "ir3/ir3_gallium.h"
+#include "ir3/ir3_nir.h"
+#include "ir3/instr-a3xx.h"
+#include "ir3/ir3.h"
 
 #include "compiler/glsl/standalone.h"
 #include "compiler/glsl/glsl_to_nir.h"
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
deleted file mode 100644 (file)
index f00daeb..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "util/ralloc.h"
-
-#include "ir3_compiler.h"
-
-static const struct debug_named_value shader_debug_options[] = {
-               {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
-               {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
-               {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
-               {"disasm",  IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
-               {"optmsgs", IR3_DBG_OPTMSGS,"Enable optimizer debug messages"},
-               DEBUG_NAMED_VALUE_END
-};
-
-DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0)
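-
-/* flags are parsed from the IR3_SHADER_DEBUG environment variable, as a
- * comma-separated list of the names above, e.g. IR3_SHADER_DEBUG=fs,disasm
- */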
-
-enum ir3_shader_debug ir3_shader_debug = 0;
-
-struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
-{
-       struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
-
-       ir3_shader_debug = debug_get_option_ir3_shader_debug();
-
-       compiler->dev = dev;
-       compiler->gpu_id = gpu_id;
-       compiler->set = ir3_ra_alloc_reg_set(compiler);
-
-       if (compiler->gpu_id >= 400) {
-               /* need special handling for "flat" */
-               compiler->flat_bypass = true;
-               compiler->levels_add_one = false;
-               compiler->unminify_coords = false;
-               compiler->txf_ms_with_isaml = false;
-               compiler->array_index_add_half = true;
-       } else {
-               /* no special handling for "flat" */
-               compiler->flat_bypass = false;
-               compiler->levels_add_one = true;
-               compiler->unminify_coords = true;
-               compiler->txf_ms_with_isaml = true;
-               compiler->array_index_add_half = false;
-       }
-
-       return compiler;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
deleted file mode 100644 (file)
index e233606..0000000
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#ifndef IR3_COMPILER_H_
-#define IR3_COMPILER_H_
-
-#include "ir3_shader.h"
-
-struct ir3_ra_reg_set;
-
-struct ir3_compiler {
-       struct fd_device *dev;
-       uint32_t gpu_id;
-       struct ir3_ra_reg_set *set;
-       uint32_t shader_count;
-
-       /*
-        * Configuration options for things that are handled differently on
-        * different generations:
-        */
-
-       /* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
-        * so we need to use ldlv.u32 to load the varying directly:
-        */
-       bool flat_bypass;
-
-       /* on a3xx, we need to add one to # of array levels:
-        */
-       bool levels_add_one;
-
-       /* on a3xx, we need to scale up integer coords for isaml based
-        * on LoD:
-        */
-       bool unminify_coords;
-
-       /* on a3xx do txf_ms w/ isaml and scaled coords: */
-       bool txf_ms_with_isaml;
-
-       /* on a4xx, for array textures we need to add 0.5 to the array
-        * index coordinate:
-        */
-       bool array_index_add_half;
-};
-
-struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id);
-
-int ir3_compile_shader_nir(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *so);
-
-enum ir3_shader_debug {
-       IR3_DBG_SHADER_VS = 0x01,
-       IR3_DBG_SHADER_FS = 0x02,
-       IR3_DBG_SHADER_CS = 0x04,
-       IR3_DBG_DISASM    = 0x08,
-       IR3_DBG_OPTMSGS   = 0x10,
-};
-
-extern enum ir3_shader_debug ir3_shader_debug;
-
-static inline bool
-shader_debug_enabled(gl_shader_stage type)
-{
-       switch (type) {
-       case MESA_SHADER_VERTEX:      return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
-       case MESA_SHADER_FRAGMENT:    return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
-       case MESA_SHADER_COMPUTE:     return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
-       default:
-               debug_assert(0);
-               return false;
-       }
-}
-
-#endif /* IR3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
deleted file mode 100644 (file)
index 445a2b2..0000000
+++ /dev/null
@@ -1,3818 +0,0 @@
-/*
- * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <stdarg.h>
-
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-
-#include "ir3_compiler.h"
-#include "ir3_shader.h"
-#include "ir3_nir.h"
-
-#include "instr-a3xx.h"
-#include "ir3.h"
-
-/* for conditionally setting boolean flag(s): */
-#define COND(bool, val) ((bool) ? (val) : 0)
-
-#define DBG(fmt, ...) \
-               do { debug_printf("%s:%d: "fmt "\n", \
-                               __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
-
-struct ir3_context {
-       struct ir3_compiler *compiler;
-
-       struct nir_shader *s;
-
-       struct nir_instr *cur_instr;  /* current instruction, just for debug */
-
-       struct ir3 *ir;
-       struct ir3_shader_variant *so;
-
-       struct ir3_block *block;      /* the current block */
-       struct ir3_block *in_block;   /* block created for shader inputs */
-
-       nir_function_impl *impl;
-
-       /* For fragment shaders, varyings are not actual shader inputs,
-        * instead the hw passes a varying-coord which is used with
-        * bary.f.
-        *
-        * But NIR doesn't know that; it still declares varyings as
-        * inputs.  So we do all the input tracking normally and fix
-        * things up after compile_instructions().
-        *
-        * NOTE that frag_vcoord is the hardware position (possibly it
-        * is actually an index or tag or some such.. it is *not* a
-        * value that can be directly used for gl_FragCoord..)
-        */
-       struct ir3_instruction *frag_vcoord;
-
-       /* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
-       struct ir3_instruction *frag_face, *frag_coord;
-
-       /* For vertex shaders, keep track of the system value sources */
-       struct ir3_instruction *vertex_id, *basevertex, *instance_id;
-
-       /* For fragment shaders: */
-       struct ir3_instruction *samp_id, *samp_mask_in;
-
-       /* Compute shader inputs: */
-       struct ir3_instruction *local_invocation_id, *work_group_id;
-
-       /* mapping from nir_register to defining instruction: */
-       struct hash_table *def_ht;
-
-       unsigned num_arrays;
-
-       /* a common pattern for indirect addressing is to request the
-        * same address register multiple times.  To avoid generating
-        * duplicate instruction sequences (which our backend does not
-        * try to clean up, since that should be done at the NIR stage)
-        * we cache the address value generated for a given src value:
-        *
-        * Note that we have to cache these per alignment, since the same
-        * src used for an array of vec1 cannot also be used for an
-        * array of vec4.
-        */
-       struct hash_table *addr_ht[4];
-
-       /* last dst array, for indirect we need to insert a var-store.
-        */
-       struct ir3_instruction **last_dst;
-       unsigned last_dst_n;
-
-       /* maps nir_block to ir3_block, mostly for the purposes of
-        * figuring out the block's successors
-        */
-       struct hash_table *block_ht;
-
-       /* on a4xx, bitmask of samplers which need astc+srgb workaround: */
-       unsigned astc_srgb;
-
-       unsigned samples;             /* bitmask of x,y sample shifts */
-
-       unsigned max_texture_index;
-
-       /* set if we encounter something we can't handle yet, so we
-        * can bail cleanly and fall back to the TGSI compiler f/e
-        */
-       bool error;
-};
-
-/* gpu pointer size in units of 32bit registers/slots */
-static unsigned pointer_size(struct ir3_context *ctx)
-{
-       return (ctx->compiler->gpu_id >= 500) ? 2 : 1;
-}
-
-static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
-static struct ir3_block * get_block(struct ir3_context *ctx, const nir_block *nblock);
-
-
-static struct ir3_context *
-compile_init(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *so)
-{
-       struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);
-
-       if (compiler->gpu_id >= 400) {
-               if (so->type == MESA_SHADER_VERTEX) {
-                       ctx->astc_srgb = so->key.vastc_srgb;
-               } else if (so->type == MESA_SHADER_FRAGMENT) {
-                       ctx->astc_srgb = so->key.fastc_srgb;
-               }
-
-       } else {
-               if (so->type == MESA_SHADER_VERTEX) {
-                       ctx->samples = so->key.vsamples;
-               } else if (so->type == MESA_SHADER_FRAGMENT) {
-                       ctx->samples = so->key.fsamples;
-               }
-       }
-
-       ctx->compiler = compiler;
-       ctx->so = so;
-       ctx->def_ht = _mesa_hash_table_create(ctx,
-                       _mesa_hash_pointer, _mesa_key_pointer_equal);
-       ctx->block_ht = _mesa_hash_table_create(ctx,
-                       _mesa_hash_pointer, _mesa_key_pointer_equal);
-
-       /* TODO: maybe generate some sort of bitmask of what key
-        * lowers vs what shader has (ie. no need to run
-        * texture clamp lowering if no texture sample instrs)..
-        * although should be done further up the stack to avoid
-        * creating duplicate variants..
-        */
-
-       if (ir3_key_lowers_nir(&so->key)) {
-               nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
-               ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
-       } else {
-               /* fast-path for shader key that lowers nothing in NIR: */
-               ctx->s = so->shader->nir;
-       }
-
-       /* this needs to be the last pass run, so do this here instead of
-        * in ir3_optimize_nir():
-        */
-       NIR_PASS_V(ctx->s, nir_lower_locals_to_regs);
-       NIR_PASS_V(ctx->s, nir_convert_from_ssa, true);
-
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               printf("dump nir%dv%d: type=%d, k={cts=%u,hp=%u}",
-                       so->shader->id, so->id, so->type,
-                       so->key.color_two_side, so->key.half_precision);
-               nir_print_shader(ctx->s, stdout);
-       }
-
-       if (shader_debug_enabled(so->type)) {
-               fprintf(stderr, "NIR (final form) for %s shader:\n",
-                       _mesa_shader_stage_to_string(so->type));
-               nir_print_shader(ctx->s, stderr);
-       }
-
-       ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
-
-       so->num_uniforms = ctx->s->num_uniforms;
-       so->num_ubos = ctx->s->info.num_ubos;
-
-       /* Layout of constant registers, each section aligned to vec4.  Note
-        * that pointer size (ubo, etc) changes depending on generation.
-        *
-        *    user consts
-        *    UBO addresses
-        *    SSBO sizes
-        *    if (vertex shader) {
-        *        driver params (IR3_DP_*)
-        *        if (stream_output.num_outputs > 0)
-        *           stream-out addresses
-        *    }
-        *    immediates
-        *
-        * Immediates go last mostly because they are inserted in the CP pass
-        * after the nir -> ir3 frontend.
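-        *
-        * Worked example (hypothetical numbers): 9 vec4 user consts, 2 UBOs,
-        * no SSBOs/images, and ptrsz == 2 (gpu_id >= 500) gives:
-        *
-        *    constoff            = align(9, 4) = 12        (user consts)
-        *    constbase.ubo       = 12, constoff += align(2*2, 4)/4 -> 13
-        *    constbase.driver_param = 13 (for a VS), immediates after that.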
-        */
-       unsigned constoff = align(ctx->s->num_uniforms, 4);
-       unsigned ptrsz = pointer_size(ctx);
-
-       memset(&so->constbase, ~0, sizeof(so->constbase));
-
-       if (so->num_ubos > 0) {
-               so->constbase.ubo = constoff;
-               constoff += align(ctx->s->info.num_ubos * ptrsz, 4) / 4;
-       }
-
-       if (so->const_layout.ssbo_size.count > 0) {
-               unsigned cnt = so->const_layout.ssbo_size.count;
-               so->constbase.ssbo_sizes = constoff;
-               constoff += align(cnt, 4) / 4;
-       }
-
-       if (so->const_layout.image_dims.count > 0) {
-               unsigned cnt = so->const_layout.image_dims.count;
-               so->constbase.image_dims = constoff;
-               constoff += align(cnt, 4) / 4;
-       }
-
-       unsigned num_driver_params = 0;
-       if (so->type == MESA_SHADER_VERTEX) {
-               num_driver_params = IR3_DP_VS_COUNT;
-       } else if (so->type == MESA_SHADER_COMPUTE) {
-               num_driver_params = IR3_DP_CS_COUNT;
-       }
-
-       so->constbase.driver_param = constoff;
-       constoff += align(num_driver_params, 4) / 4;
-
-       if ((so->type == MESA_SHADER_VERTEX) &&
-                       (compiler->gpu_id < 500) &&
-                       so->shader->stream_output.num_outputs > 0) {
-               so->constbase.tfbo = constoff;
-               constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
-       }
-
-       so->constbase.immediate = constoff;
-
-       return ctx;
-}
-
-static void
-compile_error(struct ir3_context *ctx, const char *format, ...)
-{
-       struct hash_table *errors = NULL;
-       va_list ap;
-       va_start(ap, format);
-       if (ctx->cur_instr) {
-               errors = _mesa_hash_table_create(NULL,
-                               _mesa_hash_pointer,
-                               _mesa_key_pointer_equal);
-               char *msg = ralloc_vasprintf(errors, format, ap);
-               _mesa_hash_table_insert(errors, ctx->cur_instr, msg);
-       } else {
-               _debug_vprintf(format, ap);
-       }
-       va_end(ap);
-       nir_print_shader_annotated(ctx->s, stdout, errors);
-       ralloc_free(errors);
-       ctx->error = true;
-       debug_assert(0);
-}
-
-#define compile_assert(ctx, cond) do { \
-               if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
-       } while (0)
-
-static void
-compile_free(struct ir3_context *ctx)
-{
-       ralloc_free(ctx);
-}
-
-static void
-declare_array(struct ir3_context *ctx, nir_register *reg)
-{
-       struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
-       arr->id = ++ctx->num_arrays;
-       /* NOTE: sometimes we get non-array regs, for example for arrays of
-        * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
-        * treat a non-array as if it was an array of length 1.
-        *
-        * It would be nice if there was a nir pass to convert arrays of
-        * length 1 to ssa.
-        */
-       arr->length = reg->num_components * MAX2(1, reg->num_array_elems);
-       compile_assert(ctx, arr->length > 0);
-       arr->r = reg;
-       list_addtail(&arr->node, &ctx->ir->array_list);
-}
-
-static struct ir3_array *
-get_array(struct ir3_context *ctx, nir_register *reg)
-{
-       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-               if (arr->r == reg)
-                       return arr;
-       }
-       compile_error(ctx, "bogus reg: %s\n", reg->name);
-       return NULL;
-}
-
-/* relative (indirect) if address!=NULL */
-static struct ir3_instruction *
-create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
-               struct ir3_instruction *address)
-{
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *mov;
-       struct ir3_register *src;
-
-       mov = ir3_instr_create(block, OPC_MOV);
-       mov->cat1.src_type = TYPE_U32;
-       mov->cat1.dst_type = TYPE_U32;
-       mov->barrier_class = IR3_BARRIER_ARRAY_R;
-       mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
-       ir3_reg_create(mov, 0, 0);
-       src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
-                       COND(address, IR3_REG_RELATIV));
-       src->instr = arr->last_write;
-       src->size  = arr->length;
-       src->array.id = arr->id;
-       src->array.offset = n;
-
-       if (address)
-               ir3_instr_set_address(mov, address);
-
-       return mov;
-}
-
-/* relative (indirect) if address!=NULL */
-static void
-create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
-               struct ir3_instruction *src, struct ir3_instruction *address)
-{
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *mov;
-       struct ir3_register *dst;
-
-       /* if not relative store, don't create an extra mov, since that
-        * ends up being difficult for cp to remove.
-        */
-       if (!address) {
-               dst = src->regs[0];
-
-               src->barrier_class |= IR3_BARRIER_ARRAY_W;
-               src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
-
-               dst->flags |= IR3_REG_ARRAY;
-               dst->instr = arr->last_write;
-               dst->size = arr->length;
-               dst->array.id = arr->id;
-               dst->array.offset = n;
-
-               arr->last_write = src;
-
-               array_insert(block, block->keeps, src);
-
-               return;
-       }
-
-       mov = ir3_instr_create(block, OPC_MOV);
-       mov->cat1.src_type = TYPE_U32;
-       mov->cat1.dst_type = TYPE_U32;
-       mov->barrier_class = IR3_BARRIER_ARRAY_W;
-       mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
-       dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
-                       COND(address, IR3_REG_RELATIV));
-       dst->instr = arr->last_write;
-       dst->size  = arr->length;
-       dst->array.id = arr->id;
-       dst->array.offset = n;
-       ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
-
-       if (address)
-               ir3_instr_set_address(mov, address);
-
-       arr->last_write = mov;
-
-       /* the array store may only matter to something in an earlier
-        * block (ie. loops), but since arrays are not in SSA, the depth
-        * pass won't know this.. so keep all array stores:
-        */
-       array_insert(block, block->keeps, mov);
-}
-
-static inline type_t utype_for_size(unsigned bit_size)
-{
-       switch (bit_size) {
-       case 32: return TYPE_U32;
-       case 16: return TYPE_U16;
-       case  8: return TYPE_U8;
-       default: unreachable("bad bitsize"); return ~0;
-       }
-}
-
-static inline type_t utype_src(nir_src src)
-{ return utype_for_size(nir_src_bit_size(src)); }
-
-static inline type_t utype_dst(nir_dest dst)
-{ return utype_for_size(nir_dest_bit_size(dst)); }
-
-/* allocate an n-element value array (to be populated by caller) and
- * insert in def_ht
- */
-static struct ir3_instruction **
-get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n)
-{
-       struct ir3_instruction **value =
-               ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
-       _mesa_hash_table_insert(ctx->def_ht, dst, value);
-       return value;
-}
-
-static struct ir3_instruction **
-get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n)
-{
-       struct ir3_instruction **value;
-
-       if (dst->is_ssa) {
-               value = get_dst_ssa(ctx, &dst->ssa, n);
-       } else {
-               value = ralloc_array(ctx, struct ir3_instruction *, n);
-       }
-
-       /* NOTE: in non-ssa case, we don't really need to store last_dst
-        * but this helps us catch cases where a put_dst() call is forgotten
-        */
-       compile_assert(ctx, !ctx->last_dst);
-       ctx->last_dst = value;
-       ctx->last_dst_n = n;
-
-       return value;
-}
-
-static struct ir3_instruction * get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align);
-
-static struct ir3_instruction * const *
-get_src(struct ir3_context *ctx, nir_src *src)
-{
-       if (src->is_ssa) {
-               struct hash_entry *entry;
-               entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
-               compile_assert(ctx, entry);
-               return entry->data;
-       } else {
-               nir_register *reg = src->reg.reg;
-               struct ir3_array *arr = get_array(ctx, reg);
-               unsigned num_components = arr->r->num_components;
-               struct ir3_instruction *addr = NULL;
-               struct ir3_instruction **value =
-                       ralloc_array(ctx, struct ir3_instruction *, num_components);
-
-               if (src->reg.indirect)
-                       addr = get_addr(ctx, get_src(ctx, src->reg.indirect)[0],
-                                       reg->num_components);
-
-               for (unsigned i = 0; i < num_components; i++) {
-                       unsigned n = src->reg.base_offset * reg->num_components + i;
-                       compile_assert(ctx, n < arr->length);
-                       value[i] = create_array_load(ctx, arr, n, addr);
-               }
-
-               return value;
-       }
-}
-
-static void
-put_dst(struct ir3_context *ctx, nir_dest *dst)
-{
-       unsigned bit_size = nir_dest_bit_size(*dst);
-
-       if (bit_size < 32) {
-               for (unsigned i = 0; i < ctx->last_dst_n; i++) {
-                       struct ir3_instruction *dst = ctx->last_dst[i];
-                       dst->regs[0]->flags |= IR3_REG_HALF;
-                       if (ctx->last_dst[i]->opc == OPC_META_FO)
-                               dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
-               }
-       }
-
-       if (!dst->is_ssa) {
-               nir_register *reg = dst->reg.reg;
-               struct ir3_array *arr = get_array(ctx, reg);
-               unsigned num_components = ctx->last_dst_n;
-               struct ir3_instruction *addr = NULL;
-
-               if (dst->reg.indirect)
-                       addr = get_addr(ctx, get_src(ctx, dst->reg.indirect)[0],
-                                       reg->num_components);
-
-               for (unsigned i = 0; i < num_components; i++) {
-                       unsigned n = dst->reg.base_offset * reg->num_components + i;
-                       compile_assert(ctx, n < arr->length);
-                       if (!ctx->last_dst[i])
-                               continue;
-                       create_array_store(ctx, arr, n, ctx->last_dst[i], addr);
-               }
-
-               ralloc_free(ctx->last_dst);
-       }
-       ctx->last_dst = NULL;
-       ctx->last_dst_n = 0;
-}
-
-static struct ir3_instruction *
-create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
-{
-       struct ir3_instruction *mov;
-       unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
-
-       mov = ir3_instr_create(block, OPC_MOV);
-       mov->cat1.src_type = type;
-       mov->cat1.dst_type = type;
-       ir3_reg_create(mov, 0, flags);
-       ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
-
-       return mov;
-}
-
-static struct ir3_instruction *
-create_immed(struct ir3_block *block, uint32_t val)
-{
-       return create_immed_typed(block, val, TYPE_U32);
-}
-
-static struct ir3_instruction *
-create_addr(struct ir3_block *block, struct ir3_instruction *src, int align)
-{
-       struct ir3_instruction *instr, *immed;
-
-       /* TODO in at least some cases, the backend could probably be
-        * made clever enough to propagate IR3_REG_HALF..
-        */
-       instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
-       instr->regs[0]->flags |= IR3_REG_HALF;
-
-       switch(align){
-       case 1:
-               /* src *= 1: */
-               break;
-       case 2:
-               /* src *= 2     => src <<= 1: */
-               immed = create_immed(block, 1);
-               immed->regs[0]->flags |= IR3_REG_HALF;
-
-               instr = ir3_SHL_B(block, instr, 0, immed, 0);
-               instr->regs[0]->flags |= IR3_REG_HALF;
-               instr->regs[1]->flags |= IR3_REG_HALF;
-               break;
-       case 3:
-               /* src *= 3: */
-               immed = create_immed(block, 3);
-               immed->regs[0]->flags |= IR3_REG_HALF;
-
-               instr = ir3_MULL_U(block, instr, 0, immed, 0);
-               instr->regs[0]->flags |= IR3_REG_HALF;
-               instr->regs[1]->flags |= IR3_REG_HALF;
-               break;
-       case 4:
-               /* src *= 4 => src <<= 2: */
-               immed = create_immed(block, 2);
-               immed->regs[0]->flags |= IR3_REG_HALF;
-
-               instr = ir3_SHL_B(block, instr, 0, immed, 0);
-               instr->regs[0]->flags |= IR3_REG_HALF;
-               instr->regs[1]->flags |= IR3_REG_HALF;
-               break;
-       default:
-               unreachable("bad align");
-               return NULL;
-       }
-
-       instr = ir3_MOV(block, instr, TYPE_S16);
-       instr->regs[0]->num = regid(REG_A0, 0);
-       instr->regs[0]->flags |= IR3_REG_HALF;
-       instr->regs[1]->flags |= IR3_REG_HALF;
-
-       return instr;
-}
-
-/* caches addr values to avoid generating multiple cov/shl/mova
- * sequences for each use of a given NIR level src as address
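- * (e.g. two loads indexed by the same NIR src at align=4 share one
- * cov/shl/mova sequence, cached in addr_ht[3])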
- */
-static struct ir3_instruction *
-get_addr(struct ir3_context *ctx, struct ir3_instruction *src, int align)
-{
-       struct ir3_instruction *addr;
-       unsigned idx = align - 1;
-
-       compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr_ht));
-
-       if (!ctx->addr_ht[idx]) {
-               ctx->addr_ht[idx] = _mesa_hash_table_create(ctx,
-                               _mesa_hash_pointer, _mesa_key_pointer_equal);
-       } else {
-               struct hash_entry *entry;
-               entry = _mesa_hash_table_search(ctx->addr_ht[idx], src);
-               if (entry)
-                       return entry->data;
-       }
-
-       addr = create_addr(ctx->block, src, align);
-       _mesa_hash_table_insert(ctx->addr_ht[idx], src, addr);
-
-       return addr;
-}
-
-static struct ir3_instruction *
-get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *cond;
-
-       /* NOTE: only cmps.*.* can write p0.x: */
-       cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
-       cond->cat2.condition = IR3_COND_NE;
-
-       /* condition always goes in predicate register: */
-       cond->regs[0]->num = regid(REG_P0, 0);
-
-       return cond;
-}
-
-static struct ir3_instruction *
-create_uniform(struct ir3_context *ctx, unsigned n)
-{
-       struct ir3_instruction *mov;
-
-       mov = ir3_instr_create(ctx->block, OPC_MOV);
-       /* TODO get types right? */
-       mov->cat1.src_type = TYPE_F32;
-       mov->cat1.dst_type = TYPE_F32;
-       ir3_reg_create(mov, 0, 0);
-       ir3_reg_create(mov, n, IR3_REG_CONST);
-
-       return mov;
-}
-
-static struct ir3_instruction *
-create_uniform_indirect(struct ir3_context *ctx, int n,
-               struct ir3_instruction *address)
-{
-       struct ir3_instruction *mov;
-
-       mov = ir3_instr_create(ctx->block, OPC_MOV);
-       mov->cat1.src_type = TYPE_U32;
-       mov->cat1.dst_type = TYPE_U32;
-       ir3_reg_create(mov, 0, 0);
-       ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
-
-       ir3_instr_set_address(mov, address);
-
-       return mov;
-}
-
-static struct ir3_instruction *
-create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr,
-               unsigned arrsz)
-{
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *collect;
-
-       if (arrsz == 0)
-               return NULL;
-
-       unsigned flags = arr[0]->regs[0]->flags & IR3_REG_HALF;
-
-       collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
-       ir3_reg_create(collect, 0, flags);     /* dst */
-       for (unsigned i = 0; i < arrsz; i++) {
-               struct ir3_instruction *elem = arr[i];
-
-               /* Since arrays are pre-colored in RA, we can't assume that
-                * things will end up in the right place.  (Ie. if a collect
-                * joins elements from two different arrays.)  So insert an
-                * extra mov.
-                *
-                * We could possibly skip this if all the collected elements
-                * are contiguous elements in a single array.. not sure how
-                * likely that is to happen.
-                *
-                * Fixes a problem with glamor shaders, that in effect do
-                * something like:
-                *
-                *   if (foo)
-                *     texcoord = ..
-                *   else
-                *     texcoord = ..
-                *   color = texture2D(tex, texcoord);
-                *
-                * In this case, texcoord will end up as nir registers (which
-                * translate to ir3 arrays of length 1).  And we can't assume
-                * the two (or more) arrays will get allocated in consecutive
-                * scalar registers.
-                *
-                */
-               if (elem->regs[0]->flags & IR3_REG_ARRAY) {
-                       type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-                       elem = ir3_MOV(block, elem, type);
-               }
-
-               compile_assert(ctx, (elem->regs[0]->flags & IR3_REG_HALF) == flags);
-               ir3_reg_create(collect, 0, IR3_REG_SSA | flags)->instr = elem;
-       }
-
-       return collect;
-}
-
-static struct ir3_instruction *
-create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n,
-               struct ir3_instruction *address, struct ir3_instruction *collect)
-{
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *mov;
-       struct ir3_register *src;
-
-       mov = ir3_instr_create(block, OPC_MOV);
-       mov->cat1.src_type = TYPE_U32;
-       mov->cat1.dst_type = TYPE_U32;
-       ir3_reg_create(mov, 0, 0);
-       src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
-       src->instr = collect;
-       src->size  = arrsz;
-       src->array.offset = n;
-
-       ir3_instr_set_address(mov, address);
-
-       return mov;
-}
-
-static struct ir3_instruction *
-create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask)
-{
-       struct ir3_instruction *in;
-
-       in = ir3_instr_create(ctx->in_block, OPC_META_INPUT);
-       in->inout.block = ctx->in_block;
-       ir3_reg_create(in, n, 0);
-
-       in->regs[0]->wrmask = compmask;
-
-       return in;
-}
-
-static struct ir3_instruction *
-create_input(struct ir3_context *ctx, unsigned n)
-{
-       return create_input_compmask(ctx, n, 0x1);
-}
-
-static struct ir3_instruction *
-create_frag_input(struct ir3_context *ctx, bool use_ldlv)
-{
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *instr;
-       /* actual inloc is assigned and fixed up later: */
-       struct ir3_instruction *inloc = create_immed(block, 0);
-
-       if (use_ldlv) {
-               instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
-               instr->cat6.type = TYPE_U32;
-               instr->cat6.iim_val = 1;
-       } else {
-               instr = ir3_BARY_F(block, inloc, 0, ctx->frag_vcoord, 0);
-               instr->regs[2]->wrmask = 0x3;
-       }
-
-       return instr;
-}
-
-static struct ir3_instruction *
-create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
-{
-       /* first four vec4 sysvals reserved for UBOs: */
-       /* NOTE: dp is in scalar, but there can be >4 dp components: */
-       unsigned n = ctx->so->constbase.driver_param;
-       unsigned r = regid(n + dp / 4, dp % 4);
-       return create_uniform(ctx, r);
-}
-
-/* helper for instructions that produce multiple consecutive scalar
- * outputs which need to have a split/fanout meta instruction inserted
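- * (e.g. a sam instruction writing a vec4 result gets four OPC_META_FO
- * "fanout" instructions, with fo.off = 0..3)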
- */
-static void
-split_dest(struct ir3_block *block, struct ir3_instruction **dst,
-               struct ir3_instruction *src, unsigned base, unsigned n)
-{
-       struct ir3_instruction *prev = NULL;
-
-       if ((n == 1) && (src->regs[0]->wrmask == 0x1)) {
-               dst[0] = src;
-               return;
-       }
-
-       for (int i = 0, j = 0; i < n; i++) {
-               struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
-               ir3_reg_create(split, 0, IR3_REG_SSA);
-               ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
-               split->fo.off = i + base;
-
-               if (prev) {
-                       split->cp.left = prev;
-                       split->cp.left_cnt++;
-                       prev->cp.right = split;
-                       prev->cp.right_cnt++;
-               }
-               prev = split;
-
-               if (src->regs[0]->wrmask & (1 << (i + base)))
-                       dst[j++] = split;
-       }
-}
-
-/*
- * Adreno uses uint rather than having a dedicated bool type, which
- * (potentially) requires some conversion, in particular when using the
- * output of a bool instr as an int input, or vice versa.
- *
- *         | Adreno  |  NIR  |
- *  -------+---------+-------+-
- *   true  |    1    |  ~0   |
- *   false |    0    |   0   |
- *
- * To convert from an adreno bool (uint) to nir, use:
- *
- *    absneg.s dst, (neg)src
- *
- * To convert back in the other direction:
- *
- *    absneg.s dst, (abs)src
- *
- * The CP step can clean up the absneg.s that cancel each other
- * out, and with a slight bit of extra cleverness (to recognize
- * the instructions which produce either a 0 or 1) can eliminate
- * the absneg.s's completely when an instruction that wants
- * 0/1 consumes the result.  For example, when a nir 'bcsel'
- * consumes the result of 'feq'.  So we should be able to get by
- * without a boolean resolve step, and without incurring any
- * extra penalty in instruction count.
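- *
- * Worked example: NIR true (~0, ie. -1) -> absneg.s dst, (abs)src -> 1
- * (adreno true); adreno true (1) -> absneg.s dst, (neg)src -> -1, ie.
- * ~0 (NIR true).  Zero (false) maps to zero in both directions.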
- */
-
-/* NIR bool -> native (adreno): */
-static struct ir3_instruction *
-ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
-{
-       return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
-}
-
-/* native (adreno) -> NIR bool: */
-static struct ir3_instruction *
-ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
-{
-       return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
-}
-
-/*
- * alu/sfu instructions:
- */
-
-static struct ir3_instruction *
-create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
-               unsigned src_bitsize, nir_op op)
-{
-       type_t src_type, dst_type;
-
-       switch (op) {
-       case nir_op_f2f32:
-       case nir_op_f2f16_rtne:
-       case nir_op_f2f16_rtz:
-       case nir_op_f2f16:
-       case nir_op_f2i32:
-       case nir_op_f2i16:
-       case nir_op_f2i8:
-       case nir_op_f2u32:
-       case nir_op_f2u16:
-       case nir_op_f2u8:
-               switch (src_bitsize) {
-               case 32:
-                       src_type = TYPE_F32;
-                       break;
-               case 16:
-                       src_type = TYPE_F16;
-                       break;
-               default:
-                       compile_error(ctx, "invalid src bit size: %u", src_bitsize);
-               }
-               break;
-
-       case nir_op_i2f32:
-       case nir_op_i2f16:
-       case nir_op_i2i32:
-       case nir_op_i2i16:
-       case nir_op_i2i8:
-               switch (src_bitsize) {
-               case 32:
-                       src_type = TYPE_S32;
-                       break;
-               case 16:
-                       src_type = TYPE_S16;
-                       break;
-               case 8:
-                       src_type = TYPE_S8;
-                       break;
-               default:
-                       compile_error(ctx, "invalid src bit size: %u", src_bitsize);
-               }
-               break;
-
-       case nir_op_u2f32:
-       case nir_op_u2f16:
-       case nir_op_u2u32:
-       case nir_op_u2u16:
-       case nir_op_u2u8:
-               switch (src_bitsize) {
-               case 32:
-                       src_type = TYPE_U32;
-                       break;
-               case 16:
-                       src_type = TYPE_U16;
-                       break;
-               case 8:
-                       src_type = TYPE_U8;
-                       break;
-               default:
-                       compile_error(ctx, "invalid src bit size: %u", src_bitsize);
-               }
-               break;
-
-       default:
-               compile_error(ctx, "invalid conversion op: %u", op);
-       }
-
-       switch (op) {
-       case nir_op_f2f32:
-       case nir_op_i2f32:
-       case nir_op_u2f32:
-               dst_type = TYPE_F32;
-               break;
-
-       case nir_op_f2f16_rtne:
-       case nir_op_f2f16_rtz:
-       case nir_op_f2f16:
-               /* TODO how to handle rounding mode? */
-       case nir_op_i2f16:
-       case nir_op_u2f16:
-               dst_type = TYPE_F16;
-               break;
-
-       case nir_op_f2i32:
-       case nir_op_i2i32:
-               dst_type = TYPE_S32;
-               break;
-
-       case nir_op_f2i16:
-       case nir_op_i2i16:
-               dst_type = TYPE_S16;
-               break;
-
-       case nir_op_f2i8:
-       case nir_op_i2i8:
-               dst_type = TYPE_S8;
-               break;
-
-       case nir_op_f2u32:
-       case nir_op_u2u32:
-               dst_type = TYPE_U32;
-               break;
-
-       case nir_op_f2u16:
-       case nir_op_u2u16:
-               dst_type = TYPE_U16;
-               break;
-
-       case nir_op_f2u8:
-       case nir_op_u2u8:
-               dst_type = TYPE_U8;
-               break;
-
-       default:
-               compile_error(ctx, "invalid conversion op: %u", op);
-       }
-
-       return ir3_COV(ctx->block, src, src_type, dst_type);
-}
-
-static void
-emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
-{
-       const nir_op_info *info = &nir_op_infos[alu->op];
-       struct ir3_instruction **dst, *src[info->num_inputs];
-       unsigned bs[info->num_inputs];     /* bit size */
-       struct ir3_block *b = ctx->block;
-       unsigned dst_sz, wrmask;
-
-       if (alu->dest.dest.is_ssa) {
-               dst_sz = alu->dest.dest.ssa.num_components;
-               wrmask = (1 << dst_sz) - 1;
-       } else {
-               dst_sz = alu->dest.dest.reg.reg->num_components;
-               wrmask = alu->dest.write_mask;
-       }
-
-       dst = get_dst(ctx, &alu->dest.dest, dst_sz);
-
-       /* Vectors are special in that they have non-scalarized writemasks,
-        * and just take the first swizzle channel of each argument, in
-        * order, one per writemask channel.
-        */
-       if ((alu->op == nir_op_vec2) ||
-                       (alu->op == nir_op_vec3) ||
-                       (alu->op == nir_op_vec4)) {
-
-               for (int i = 0; i < info->num_inputs; i++) {
-                       nir_alu_src *asrc = &alu->src[i];
-
-                       compile_assert(ctx, !asrc->abs);
-                       compile_assert(ctx, !asrc->negate);
-
-                       src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
-                       if (!src[i])
-                               src[i] = create_immed(ctx->block, 0);
-                       dst[i] = ir3_MOV(b, src[i], TYPE_U32);
-               }
-
-               put_dst(ctx, &alu->dest.dest);
-               return;
-       }
-
-       /* We also get movs with more than one component, so handle
-        * those specially:
-        */
-       if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) {
-               type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32;
-               nir_alu_src *asrc = &alu->src[0];
-               struct ir3_instruction *const *src0 = get_src(ctx, &asrc->src);
-
-               for (unsigned i = 0; i < dst_sz; i++) {
-                       if (wrmask & (1 << i)) {
-                               dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type);
-                       } else {
-                               dst[i] = NULL;
-                       }
-               }
-
-               put_dst(ctx, &alu->dest.dest);
-               return;
-       }
-
-       /* General case: We can just grab the one used channel per src. */
-       for (int i = 0; i < info->num_inputs; i++) {
-               unsigned chan = ffs(alu->dest.write_mask) - 1;
-               nir_alu_src *asrc = &alu->src[i];
-
-               compile_assert(ctx, !asrc->abs);
-               compile_assert(ctx, !asrc->negate);
-
-               src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
-               bs[i] = nir_src_bit_size(asrc->src);
-
-               compile_assert(ctx, src[i]);
-       }
-
-       switch (alu->op) {
-       case nir_op_f2f32:
-       case nir_op_f2f16_rtne:
-       case nir_op_f2f16_rtz:
-       case nir_op_f2f16:
-       case nir_op_f2i32:
-       case nir_op_f2i16:
-       case nir_op_f2i8:
-       case nir_op_f2u32:
-       case nir_op_f2u16:
-       case nir_op_f2u8:
-       case nir_op_i2f32:
-       case nir_op_i2f16:
-       case nir_op_i2i32:
-       case nir_op_i2i16:
-       case nir_op_i2i8:
-       case nir_op_u2f32:
-       case nir_op_u2f16:
-       case nir_op_u2u32:
-       case nir_op_u2u16:
-       case nir_op_u2u8:
-               dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
-               break;
-       case nir_op_f2b:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
-               dst[0]->cat2.condition = IR3_COND_NE;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_b2f:
-               dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
-               break;
-       case nir_op_b2i:
-               dst[0] = ir3_b2n(b, src[0]);
-               break;
-       case nir_op_i2b:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
-               dst[0]->cat2.condition = IR3_COND_NE;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-
-       case nir_op_fneg:
-               dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
-               break;
-       case nir_op_fabs:
-               dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
-               break;
-       case nir_op_fmax:
-               dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_fmin:
-               dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_fsat:
-               /* if there is just a single use of the src, and it supports
-                * the (sat) bit, we can just fold the (sat) flag back into the
-                * src instruction and create a mov.  This is easier for cp
-                * to eliminate.
-                *
-                * TODO probably opc_cat==4 is ok too
-                */
-               if (alu->src[0].src.is_ssa &&
-                               (list_length(&alu->src[0].src.ssa->uses) == 1) &&
-                               ((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) {
-                       src[0]->flags |= IR3_INSTR_SAT;
-                       dst[0] = ir3_MOV(b, src[0], TYPE_U32);
-               } else {
-                       /* otherwise generate a max.f that saturates.. blob does
-                        * similar (generating a cat2 mov using max.f)
-                        */
-                       dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
-                       dst[0]->flags |= IR3_INSTR_SAT;
-               }
-               break;
-       case nir_op_fmul:
-               dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_fadd:
-               dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_fsub:
-               dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
-               break;
-       case nir_op_ffma:
-               dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
-               break;
-       case nir_op_fddx:
-               dst[0] = ir3_DSX(b, src[0], 0);
-               dst[0]->cat5.type = TYPE_F32;
-               break;
-       case nir_op_fddy:
-               dst[0] = ir3_DSY(b, src[0], 0);
-               dst[0]->cat5.type = TYPE_F32;
-               break;
-       case nir_op_flt:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_LT;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_fge:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_GE;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_feq:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_EQ;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_fne:
-               dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_NE;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_fceil:
-               dst[0] = ir3_CEIL_F(b, src[0], 0);
-               break;
-       case nir_op_ffloor:
-               dst[0] = ir3_FLOOR_F(b, src[0], 0);
-               break;
-       case nir_op_ftrunc:
-               dst[0] = ir3_TRUNC_F(b, src[0], 0);
-               break;
-       case nir_op_fround_even:
-               dst[0] = ir3_RNDNE_F(b, src[0], 0);
-               break;
-       case nir_op_fsign:
-               dst[0] = ir3_SIGN_F(b, src[0], 0);
-               break;
-
-       case nir_op_fsin:
-               dst[0] = ir3_SIN(b, src[0], 0);
-               break;
-       case nir_op_fcos:
-               dst[0] = ir3_COS(b, src[0], 0);
-               break;
-       case nir_op_frsq:
-               dst[0] = ir3_RSQ(b, src[0], 0);
-               break;
-       case nir_op_frcp:
-               dst[0] = ir3_RCP(b, src[0], 0);
-               break;
-       case nir_op_flog2:
-               dst[0] = ir3_LOG2(b, src[0], 0);
-               break;
-       case nir_op_fexp2:
-               dst[0] = ir3_EXP2(b, src[0], 0);
-               break;
-       case nir_op_fsqrt:
-               dst[0] = ir3_SQRT(b, src[0], 0);
-               break;
-
-       case nir_op_iabs:
-               dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
-               break;
-       case nir_op_iadd:
-               dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_iand:
-               dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_imax:
-               dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_umax:
-               dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_imin:
-               dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_umin:
-               dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_imul:
-               /*
-                * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
-                *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
-                *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
-                *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
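-                *
-                * Worked example (hypothetical values): a = 0x00010002,
-                * b = 0x00030004 -> 8 + 0x40000 + 0x60000 = 0x000a0008 in
-                * the low 32b; the (ah * bh) << 32 term overflows out.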
-                */
-               dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
-                                       ir3_MADSH_M16(b, src[0], 0, src[1], 0,
-                                               ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
-               break;
-       case nir_op_ineg:
-               dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
-               break;
-       case nir_op_inot:
-               dst[0] = ir3_NOT_B(b, src[0], 0);
-               break;
-       case nir_op_ior:
-               dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ishl:
-               dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ishr:
-               dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_isign: {
-               /* maybe this would be sane to lower in nir.. */
-               struct ir3_instruction *neg, *pos;
-
-               neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
-               neg->cat2.condition = IR3_COND_LT;
-
-               pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
-               pos->cat2.condition = IR3_COND_GT;
-
-               dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
-
-               break;
-       }
-       case nir_op_isub:
-               dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ixor:
-               dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ushr:
-               dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
-               break;
-       case nir_op_ilt:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_LT;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_ige:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_GE;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_ieq:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_EQ;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_ine:
-               dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_NE;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_ult:
-               dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_LT;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-       case nir_op_uge:
-               dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
-               dst[0]->cat2.condition = IR3_COND_GE;
-               dst[0] = ir3_n2b(b, dst[0]);
-               break;
-
-       case nir_op_bcsel: {
-               struct ir3_instruction *cond = ir3_b2n(b, src[0]);
-               compile_assert(ctx, bs[1] == bs[2]);
-               /* the boolean condition is 32b even if src[1] and src[2] are
-                * half-precision, but sel.b16 wants all three src's to be the
-                * same type.
-                */
-               if (bs[1] < 32)
-                       cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
-               dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
-               break;
-       }
-       case nir_op_bit_count:
-               dst[0] = ir3_CBITS_B(b, src[0], 0);
-               break;
-       case nir_op_ifind_msb: {
-               struct ir3_instruction *cmp;
-               dst[0] = ir3_CLZ_S(b, src[0], 0);
-               cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
-               cmp->cat2.condition = IR3_COND_GE;
-               dst[0] = ir3_SEL_B32(b,
-                               ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
-                               cmp, 0, dst[0], 0);
-               break;
-       }
-       case nir_op_ufind_msb:
-               dst[0] = ir3_CLZ_B(b, src[0], 0);
-               dst[0] = ir3_SEL_B32(b,
-                               ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
-                               src[0], 0, dst[0], 0);
-               break;
-       case nir_op_find_lsb:
-               dst[0] = ir3_BFREV_B(b, src[0], 0);
-               dst[0] = ir3_CLZ_B(b, dst[0], 0);
-               break;
-       case nir_op_bitfield_reverse:
-               dst[0] = ir3_BFREV_B(b, src[0], 0);
-               break;
-
-       default:
-               compile_error(ctx, "Unhandled ALU op: %s\n",
-                               nir_op_infos[alu->op].name);
-               break;
-       }
-
-       put_dst(ctx, &alu->dest.dest);
-}
-
-/* handles direct/indirect UBO reads: */
-static void
-emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
-       nir_const_value *const_offset;
-       /* UBO addresses are the first driver params: */
-       unsigned ubo = regid(ctx->so->constbase.ubo, 0);
-       const unsigned ptrsz = pointer_size(ctx);
-
-       int off = 0;
-
-       /* First src is the ubo index, which may or may not be an immed: */
-       src0 = get_src(ctx, &intr->src[0])[0];
-       if (is_same_type_mov(src0) &&
-                       (src0->regs[1]->flags & IR3_REG_IMMED)) {
-               base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz));
-               base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
-       } else {
-               base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0, 4));
-               base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0, 4));
-       }
-
-       /* note: on 32bit gpus base_hi is ignored and DCE'd */
-       addr = base_lo;
-
-       const_offset = nir_src_as_const_value(intr->src[1]);
-       if (const_offset) {
-               off += const_offset->u32[0];
-       } else {
-               /* For load_ubo_indirect, second src is indirect offset: */
-               src1 = get_src(ctx, &intr->src[1])[0];
-
-               /* and add offset to addr: */
-               addr = ir3_ADD_S(b, addr, 0, src1, 0);
-       }
-
-       /* if offset is too large to encode in the ldg, split it out: */
-       if ((off + (intr->num_components * 4)) > 1024) {
-               /* split out the minimal amount to improve the odds that
-                * cp can fit the immediate in the add.s instruction:
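-                *
-                * e.g. (hypothetical) off = 1020 with a vec4 load: off2 =
-                * 1020 + 16 - 1024 = 12 is added via add.s, leaving the
-                * remaining 1008 to be encoded directly in the ldg.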
-                */
-               unsigned off2 = off + (intr->num_components * 4) - 1024;
-               addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
-               off -= off2;
-       }
-
-       if (ptrsz == 2) {
-               struct ir3_instruction *carry;
-
-               /* handle 32b rollover, ie:
-                *   if (addr < base_lo)
-                *      base_hi++
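-                *
-                * cmps.u with IR3_COND_LT produces 1 (native true) exactly
-                * when the low 32b addition wrapped, so the add.s below
-                * bumps base_hi by one in that case.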
-                */
-               carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
-               carry->cat2.condition = IR3_COND_LT;
-               base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
-
-               addr = create_collect(ctx, (struct ir3_instruction*[]){ addr, base_hi }, 2);
-       }
-
-       for (int i = 0; i < intr->num_components; i++) {
-               struct ir3_instruction *load =
-                               ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
-               load->cat6.type = TYPE_U32;
-               load->cat6.src_offset = off + i * 4;     /* byte offset */
-               dst[i] = load;
-       }
-}
-
-/* src[] = { buffer_index, offset }. No const_index */
-static void
-emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *ldgb, *src0, *src1, *offset;
-       nir_const_value *const_offset;
-
-       /* can this be non-const buffer_index?  how do we handle that? */
-       const_offset = nir_src_as_const_value(intr->src[0]);
-       compile_assert(ctx, const_offset);
-
-       offset = get_src(ctx, &intr->src[1])[0];
-
-       /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
-       src0 = create_collect(ctx, (struct ir3_instruction*[]){
-               offset,
-               create_immed(b, 0),
-       }, 2);
-       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-
-       ldgb = ir3_LDGB(b, create_immed(b, const_offset->u32[0]), 0,
-                       src0, 0, src1, 0);
-       ldgb->regs[0]->wrmask = MASK(intr->num_components);
-       ldgb->cat6.iim_val = intr->num_components;
-       ldgb->cat6.d = 4;
-       ldgb->cat6.type = TYPE_U32;
-       ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
-       ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
-
-       split_dest(b, dst, ldgb, 0, intr->num_components);
-}
-
-/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
-static void
-emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *stgb, *src0, *src1, *src2, *offset;
-       nir_const_value *const_offset;
-       /* TODO handle wrmask properly, see _store_shared().. but I think
-        * it is more of a PITA than that, since the blob ends up loading the
-        * masked components and writing them back out.
-        */
-       unsigned wrmask = intr->const_index[0];
-       unsigned ncomp = ffs(~wrmask) - 1;
-
-       /* can this be non-const buffer_index?  how do we handle that? */
-       const_offset = nir_src_as_const_value(intr->src[1]);
-       compile_assert(ctx, const_offset);
-
-       offset = get_src(ctx, &intr->src[2])[0];
-
-       /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
-        * nir already *= 4:
-        */
-       src0 = create_collect(ctx, get_src(ctx, &intr->src[0]), ncomp);
-       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-       src2 = create_collect(ctx, (struct ir3_instruction*[]){
-               offset,
-               create_immed(b, 0),
-       }, 2);
-
-       stgb = ir3_STGB(b, create_immed(b, const_offset->u32[0]), 0,
-                       src0, 0, src1, 0, src2, 0);
-       stgb->cat6.iim_val = ncomp;
-       stgb->cat6.d = 4;
-       stgb->cat6.type = TYPE_U32;
-       stgb->barrier_class = IR3_BARRIER_BUFFER_W;
-       stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-
-       array_insert(b, b->keeps, stgb);
-}
-
-/* src[] = { block_index } */
-static void
-emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
-{
-       /* SSBO size stored as a const starting at ssbo_sizes: */
-       unsigned blk_idx = nir_src_as_const_value(intr->src[0])->u32[0];
-       unsigned idx = regid(ctx->so->constbase.ssbo_sizes, 0) +
-               ctx->so->const_layout.ssbo_size.off[blk_idx];
-
-       debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx));
-
-       dst[0] = create_uniform(ctx, idx);
-}
-
-/*
- * SSBO atomic intrinsics
- *
- * All of the SSBO atomic memory operations read a value from memory,
- * compute a new value using one of the operations below, write the new
- * value to memory, and return the original value read.
- *
- * All operations take 3 sources except CompSwap that takes 4. These
- * sources represent:
- *
- * 0: The SSBO buffer index.
- * 1: The offset into the SSBO buffer of the variable that the atomic
- *    operation will operate on.
- * 2: The data parameter to the atomic function (i.e. the value to add
- *    in ssbo_atomic_add, etc).
- * 3: For CompSwap only: the second data parameter.
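- *
- * For example, a hypothetical ssbo_atomic_add(ssbo=0, offset=16, data=1)
- * becomes atomic.add.g below with src0 = 1, src1 = dword offset (4) and
- * src2 = uvec2(16, 0), returning the pre-add value from memory.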
- */
-static struct ir3_instruction *
-emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *offset;
-       nir_const_value *const_offset;
-       type_t type = TYPE_U32;
-
-       /* can this be non-const buffer_index?  how do we handle that? */
-       const_offset = nir_src_as_const_value(intr->src[0]);
-       compile_assert(ctx, const_offset);
-       ssbo = create_immed(b, const_offset->u32[0]);
-
-       offset = get_src(ctx, &intr->src[1])[0];
-
-       /* src0 is data (or uvec2(data, compare))
-        * src1 is offset
-        * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
-        *
-        * Note that nir already multiplies the offset by four
-        */
-       src0 = get_src(ctx, &intr->src[2])[0];
-       src1 = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-       src2 = create_collect(ctx, (struct ir3_instruction*[]){
-               offset,
-               create_immed(b, 0),
-       }, 2);
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_ssbo_atomic_add:
-               atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_imin:
-               atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_ssbo_atomic_umin:
-               atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_imax:
-               atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_ssbo_atomic_umax:
-               atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_and:
-               atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_or:
-               atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_xor:
-               atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_exchange:
-               atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_ssbo_atomic_comp_swap:
-               /* for cmpxchg, src0 is [ui]vec2(data, compare): */
-               src0 = create_collect(ctx, (struct ir3_instruction*[]){
-                       get_src(ctx, &intr->src[3])[0],
-                       src0,
-               }, 2);
-               atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       atomic->cat6.iim_val = 1;
-       atomic->cat6.d = 4;
-       atomic->cat6.type = type;
-       atomic->barrier_class = IR3_BARRIER_BUFFER_W;
-       atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-
-       /* even if nothing consumes the result, we can't DCE the instruction: */
-       array_insert(b, b->keeps, atomic);
-
-       return atomic;
-}
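-
-/* For illustration, for 'atomicAdd(buf.counter, 1u)' with the byte offset
- * of 'counter' in 'offset', the lowering above builds (a sketch, not
- * actual generated output):
- *
- *    ssbo = immed(0)              // const buffer index
- *    src0 = immed(1)              // the data to add
- *    src1 = offset >> 2           // dword offset
- *    src2 = uvec2(offset, 0)      // 64b byte offset
- *
- * and the resulting atomic writes the sum back and returns the pre-op
- * value.
- */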
-
-/* src[] = { offset }. const_index[] = { base } */
-static void
-emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *ldl, *offset;
-       unsigned base;
-
-       offset = get_src(ctx, &intr->src[0])[0];
-       base   = nir_intrinsic_base(intr);
-
-       ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0);
-       ldl->cat6.src_offset = base;
-       ldl->cat6.type = utype_dst(intr->dest);
-       ldl->regs[0]->wrmask = MASK(intr->num_components);
-
-       ldl->barrier_class = IR3_BARRIER_SHARED_R;
-       ldl->barrier_conflict = IR3_BARRIER_SHARED_W;
-
-       split_dest(b, dst, ldl, 0, intr->num_components);
-}
-
-/* src[] = { value, offset }. const_index[] = { base, write_mask } */
-static void
-emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *stl, *offset;
-       struct ir3_instruction * const *value;
-       unsigned base, wrmask;
-
-       value  = get_src(ctx, &intr->src[0]);
-       offset = get_src(ctx, &intr->src[1])[0];
-
-       base   = nir_intrinsic_base(intr);
-       wrmask = nir_intrinsic_write_mask(intr);
-
-       /* Combine groups of consecutive enabled channels in one write
-        * message. We use ffs to find the first enabled channel and then ffs on
-        * the bit-inverse, down-shifted writemask to determine the length of
-        * the block of enabled bits.
-        *
-        * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
-        */
-       while (wrmask) {
-               unsigned first_component = ffs(wrmask) - 1;
-               unsigned length = ffs(~(wrmask >> first_component)) - 1;
-
-               stl = ir3_STL(b, offset, 0,
-                       create_collect(ctx, &value[first_component], length), 0,
-                       create_immed(b, length), 0);
-               stl->cat6.dst_offset = first_component + base;
-               stl->cat6.type = utype_src(intr->src[0]);
-               stl->barrier_class = IR3_BARRIER_SHARED_W;
-               stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
-
-               array_insert(b, b->keeps, stl);
-
-               /* Clear the bits in the writemask that we just wrote, then try
-                * again to see if more channels are left.
-                */
-               wrmask &= (15 << (first_component + length));
-       }
-}
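-
-/* To illustrate the ffs trick above, take wrmask = 0b1011:
- *
- *   pass 1: first_component = ffs(0b1011) - 1 = 0
- *           length = ffs(~(0b1011 >> 0)) - 1 = 2
- *           -> one stl covers channels {0,1}; wrmask &= (15 << 2) -> 0b1000
- *   pass 2: first_component = 3, length = 1
- *           -> one stl covers channel {3}; wrmask &= (15 << 4) -> 0
- */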
-
-/*
- * CS shared variable atomic intrinsics
- *
- * All of the shared variable atomic memory operations read a value from
- * memory, compute a new value using one of the operations below, write the
- * new value to memory, and return the original value read.
- *
- * All operations take 2 sources except CompSwap that takes 3. These
- * sources represent:
- *
- * 0: The offset into the shared variable storage region that the atomic
- *    operation will operate on.
- * 1: The data parameter to the atomic function (i.e. the value to add
- *    in shared_atomic_add, etc).
- * 2: For CompSwap only: the second data parameter.
- */
-static struct ir3_instruction *
-emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *atomic, *src0, *src1;
-       type_t type = TYPE_U32;
-
-       src0 = get_src(ctx, &intr->src[0])[0];   /* offset */
-       src1 = get_src(ctx, &intr->src[1])[0];   /* value */
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_shared_atomic_add:
-               atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_imin:
-               atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_shared_atomic_umin:
-               atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_imax:
-               atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
-               type = TYPE_S32;
-               break;
-       case nir_intrinsic_shared_atomic_umax:
-               atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_and:
-               atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_or:
-               atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_xor:
-               atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_exchange:
-               atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
-               break;
-       case nir_intrinsic_shared_atomic_comp_swap:
-               /* for cmpxchg, src1 is [ui]vec2(data, compare): */
-               src1 = create_collect(ctx, (struct ir3_instruction*[]){
-                       get_src(ctx, &intr->src[2])[0],
-                       src1,
-               }, 2);
-               atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       atomic->cat6.iim_val = 1;
-       atomic->cat6.d = 1;
-       atomic->cat6.type = type;
-       atomic->barrier_class = IR3_BARRIER_SHARED_W;
-       atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
-
-       /* even if nothing consumes the result, we can't DCE the instruction: */
-       array_insert(b, b->keeps, atomic);
-
-       return atomic;
-}
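-
-/* e.g. a GLSL 'atomicOr(shared_val, mask)' lands in the shared_atomic_or
- * case above, with src0 the offset of shared_val within the shared storage
- * region and src1 the mask, and the pre-op value returned in the dest.
- */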
-
-/* Images get mapped into SSBO/image state (for store/atomic) and texture
- * state block (for load).  To simplify things, invert the image id and
- * map it from the end of the state block, ie. image 0 becomes num-1, image 1
- * becomes num-2, etc.  This potentially avoids needing to re-emit texture
- * state when switching shaders.
- *
- * TODO: is the max # of samplers and SSBOs the same?  This shouldn't be
- * hard-coded.  Also, since all the gl shader stages (ie. everything but
- * CS) share the same SSBO/image state block, this might require some
- * more logic if we supported images in anything other than FS..
- */
-static unsigned
-get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref)
-{
-       unsigned loc = 0;
-       unsigned inner_size = 1;
-
-       while (deref->deref_type != nir_deref_type_var) {
-               assert(deref->deref_type == nir_deref_type_array);
-               nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
-               assert(const_index);
-
-               /* Walk up to the parent deref instruction: */
-               deref = nir_deref_instr_parent(deref);
-
-               assert(glsl_type_is_array(deref->type));
-               const unsigned array_len = glsl_get_length(deref->type);
-               loc += MIN2(const_index->u32[0], array_len - 1) * inner_size;
-
-               /* Update the inner size */
-               inner_size *= array_len;
-       }
-
-       loc += deref->var->data.driver_location;
-
-       /* TODO figure out real limit per generation, and don't hardcode: */
-       const unsigned max_samplers = 16;
-       return max_samplers - loc - 1;
-}
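-
-/* For example, with the hard-coded max_samplers = 16 above: a lone image
- * at driver_location 0 maps to slot 15, and element 2 of an image array
- * at driver_location 4 maps to slot 16 - (4 + 2) - 1 = 9.
- */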
-
-/* see tex_info() for equiv logic for texture instructions.. it would be
- * nice if this could be better unified..
- */
-static unsigned
-get_image_coords(const nir_variable *var, unsigned *flagsp)
-{
-       const struct glsl_type *type = glsl_without_array(var->type);
-       unsigned coords, flags = 0;
-
-       switch (glsl_get_sampler_dim(type)) {
-       case GLSL_SAMPLER_DIM_1D:
-       case GLSL_SAMPLER_DIM_BUF:
-               coords = 1;
-               break;
-       case GLSL_SAMPLER_DIM_2D:
-       case GLSL_SAMPLER_DIM_RECT:
-       case GLSL_SAMPLER_DIM_EXTERNAL:
-       case GLSL_SAMPLER_DIM_MS:
-               coords = 2;
-               break;
-       case GLSL_SAMPLER_DIM_3D:
-       case GLSL_SAMPLER_DIM_CUBE:
-               flags |= IR3_INSTR_3D;
-               coords = 3;
-               break;
-       default:
-               unreachable("bad sampler dim");
-               return 0;
-       }
-
-       if (glsl_sampler_type_is_array(type)) {
-               /* note: unlike tex_info(), adjust # of coords to include array idx: */
-               coords++;
-               flags |= IR3_INSTR_A;
-       }
-
-       if (flagsp)
-               *flagsp = flags;
-
-       return coords;
-}
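-
-/* e.g. an image2DArray here yields coords = 3 (x, y, array-idx) and
- * flags = IR3_INSTR_A, since (unlike tex_info()) the array index *is*
- * counted in coords.
- */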
-
-static type_t
-get_image_type(const nir_variable *var)
-{
-       switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) {
-       case GLSL_TYPE_UINT:
-               return TYPE_U32;
-       case GLSL_TYPE_INT:
-               return TYPE_S32;
-       case GLSL_TYPE_FLOAT:
-               return TYPE_F32;
-       default:
-               unreachable("bad sampler type.");
-               return 0;
-       }
-}
-
-static struct ir3_instruction *
-get_image_offset(struct ir3_context *ctx, const nir_variable *var,
-               struct ir3_instruction * const *coords, bool byteoff)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *offset;
-       unsigned ncoords = get_image_coords(var, NULL);
-
-       /* to calculate the byte offset (yes, uggg) we need (up to) three
-        * const values to know the bytes per pixel, and y and z stride:
-        */
-       unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
-               ctx->so->const_layout.image_dims.off[var->data.driver_location];
-
-       debug_assert(ctx->so->const_layout.image_dims.mask &
-                       (1 << var->data.driver_location));
-
-       /* offset = coords.x * bytes_per_pixel: */
-       offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0);
-       if (ncoords > 1) {
-               /* offset += coords.y * y_pitch: */
-               offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0,
-                               coords[1], 0, offset, 0);
-       }
-       if (ncoords > 2) {
-               /* offset += coords.z * z_pitch: */
-               offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0,
-                               coords[2], 0, offset, 0);
-       }
-
-       if (!byteoff) {
-               /* Some cases, like atomics, seem to use a dword offset instead
-                * of a byte offset.. the blob just puts an extra shr.b in there
-                * in those cases:
-                */
-               offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
-       }
-
-       return create_collect(ctx, (struct ir3_instruction*[]){
-               offset,
-               create_immed(b, 0),
-       }, 2);
-}
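-
-/* A worked example of the arithmetic above, assuming a 2D RGBA8 image
- * (4 bytes per pixel) with a 256-byte y-pitch: for coords (x, y) the
- * byte offset is x*4 + y*256, and in the !byteoff case that is shifted
- * right by 2 to give a dword offset.
- */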
-
-/* src[] = { deref, coord, sample_index }. const_index[] = {} */
-static void
-emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
-{
-       struct ir3_block *b = ctx->block;
-       const nir_variable *var = nir_intrinsic_get_var(intr, 0);
-       struct ir3_instruction *sam;
-       struct ir3_instruction * const *src0 = get_src(ctx, &intr->src[1]);
-       struct ir3_instruction *coords[4];
-       unsigned flags, ncoords = get_image_coords(var, &flags);
-       unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
-       type_t type = get_image_type(var);
-
-       /* hmm, this seems a bit odd, but it is what the blob does, and the
-        * hw (at least a5xx) just faults on bogus addresses otherwise:
-        */
-       if (flags & IR3_INSTR_3D) {
-               flags &= ~IR3_INSTR_3D;
-               flags |= IR3_INSTR_A;
-       }
-
-       for (unsigned i = 0; i < ncoords; i++)
-               coords[i] = src0[i];
-
-       if (ncoords == 1)
-               coords[ncoords++] = create_immed(b, 0);
-
-       sam = ir3_SAM(b, OPC_ISAM, type, 0b1111, flags,
-                       tex_idx, tex_idx, create_collect(ctx, coords, ncoords), NULL);
-
-       sam->barrier_class = IR3_BARRIER_IMAGE_R;
-       sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
-
-       split_dest(b, dst, sam, 0, 4);
-}
-
-/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
-static void
-emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-       struct ir3_block *b = ctx->block;
-       const nir_variable *var = nir_intrinsic_get_var(intr, 0);
-       struct ir3_instruction *stib, *offset;
-       struct ir3_instruction * const *value = get_src(ctx, &intr->src[3]);
-       struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
-       unsigned ncoords = get_image_coords(var, NULL);
-       unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
-
-       /* src0 is value
-        * src1 is coords
-        * src2 is 64b byte offset
-        */
-
-       offset = get_image_offset(ctx, var, coords, true);
-
-       /* NOTE: stib seems to take a byte offset, but stgb.typed can be used
-        * too and takes a dword offset.. not quite sure yet why the blob uses
-        * one over the other in various cases.
-        */
-
-       stib = ir3_STIB(b, create_immed(b, tex_idx), 0,
-                       create_collect(ctx, value, 4), 0,
-                       create_collect(ctx, coords, ncoords), 0,
-                       offset, 0);
-       stib->cat6.iim_val = 4;
-       stib->cat6.d = ncoords;
-       stib->cat6.type = get_image_type(var);
-       stib->cat6.typed = true;
-       stib->barrier_class = IR3_BARRIER_IMAGE_W;
-       stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
-
-       array_insert(b, b->keeps, stib);
-}
-
-static void
-emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
-               struct ir3_instruction **dst)
-{
-       struct ir3_block *b = ctx->block;
-       const nir_variable *var = nir_intrinsic_get_var(intr, 0);
-       unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
-       struct ir3_instruction *sam, *lod;
-       unsigned flags, ncoords = get_image_coords(var, &flags);
-
-       lod = create_immed(b, 0);
-       sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
-                       tex_idx, tex_idx, lod, NULL);
-
-       /* Array size actually ends up in .w rather than .z. This doesn't
-        * matter for miplevel 0, but for higher mips the value in z is
-        * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
-        * returned, which means that we have to add 1 to it for arrays for
-        * a3xx.
-        *
-        * Note: use a temporary dst and then copy, since the size of the dst
-        * array that is passed in is based on nir's understanding of the
-        * result size, not the hardware's.
-        */
-       struct ir3_instruction *tmp[4];
-
-       split_dest(b, tmp, sam, 0, 4);
-
-       /* get_size instruction returns size in bytes instead of texels
-        * for imageBuffer, so we need to divide it by the pixel size
-        * of the image format.
-        *
-        * TODO: This is at least true on a5xx. Check other gens.
-        */
-       enum glsl_sampler_dim dim =
-               glsl_get_sampler_dim(glsl_without_array(var->type));
-       if (dim == GLSL_SAMPLER_DIM_BUF) {
-               /* Since all the possible values the divisor can take are
-                * power-of-two (4, 8, or 16), the division is implemented
-                * as a shift-right.
-                * During shader setup, the log2 of the image format's
-        * bytes-per-pixel should have been emitted in the 2nd slot of
-                * image_dims. See ir3_shader::emit_image_dims().
-                */
-               unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
-                       ctx->so->const_layout.image_dims.off[var->data.driver_location];
-               struct ir3_instruction *aux = create_uniform(ctx, cb + 1);
-
-               tmp[0] = ir3_SHR_B(b, tmp[0], 0, aux, 0);
-       }
-
-       for (unsigned i = 0; i < ncoords; i++)
-               dst[i] = tmp[i];
-
-       if (flags & IR3_INSTR_A) {
-               if (ctx->compiler->levels_add_one) {
-                       dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
-               } else {
-                       dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32);
-               }
-       }
-}
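-
-/* To illustrate the imageBuffer case above: for an rgba8 buffer image
- * (4 bytes per pixel) the aux const holds log2(4) = 2, so a raw getsize
- * result of 4096 bytes becomes 4096 >> 2 = 1024 texels.
- */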
-
-/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
-static struct ir3_instruction *
-emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-       struct ir3_block *b = ctx->block;
-       const nir_variable *var = nir_intrinsic_get_var(intr, 0);
-       struct ir3_instruction *atomic, *image, *src0, *src1, *src2;
-       struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
-       unsigned ncoords = get_image_coords(var, NULL);
-
-       image = create_immed(b, get_image_slot(ctx, nir_src_as_deref(intr->src[0])));
-
-       /* src0 is value (or uvec2(value, compare))
-        * src1 is coords
-        * src2 is 64b byte offset
-        */
-       src0 = get_src(ctx, &intr->src[3])[0];
-       src1 = create_collect(ctx, coords, ncoords);
-       src2 = get_image_offset(ctx, var, coords, false);
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_image_deref_atomic_add:
-               atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_deref_atomic_min:
-               atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_deref_atomic_max:
-               atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_deref_atomic_and:
-               atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_deref_atomic_or:
-               atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_deref_atomic_xor:
-               atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_deref_atomic_exchange:
-               atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       case nir_intrinsic_image_deref_atomic_comp_swap:
-               /* for cmpxchg, src0 is [ui]vec2(data, compare): */
-               src0 = create_collect(ctx, (struct ir3_instruction*[]){
-                       get_src(ctx, &intr->src[4])[0],
-                       src0,
-               }, 2);
-               atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       atomic->cat6.iim_val = 1;
-       atomic->cat6.d = ncoords;
-       atomic->cat6.type = get_image_type(var);
-       atomic->cat6.typed = true;
-       atomic->barrier_class = IR3_BARRIER_IMAGE_W;
-       atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
-
-       /* even if nothing consumes the result, we can't DCE the instruction: */
-       array_insert(b, b->keeps, atomic);
-
-       return atomic;
-}
-
-static void
-emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction *barrier;
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_barrier:
-               barrier = ir3_BAR(b);
-               barrier->cat7.g = true;
-               barrier->cat7.l = true;
-               barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
-               barrier->barrier_class = IR3_BARRIER_EVERYTHING;
-               break;
-       case nir_intrinsic_memory_barrier:
-               barrier = ir3_FENCE(b);
-               barrier->cat7.g = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_IMAGE_W |
-                               IR3_BARRIER_BUFFER_W;
-               barrier->barrier_conflict =
-                               IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
-                               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-               break;
-       case nir_intrinsic_memory_barrier_atomic_counter:
-       case nir_intrinsic_memory_barrier_buffer:
-               barrier = ir3_FENCE(b);
-               barrier->cat7.g = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_BUFFER_W;
-               barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
-                               IR3_BARRIER_BUFFER_W;
-               break;
-       case nir_intrinsic_memory_barrier_image:
-               // TODO double check if this should have .g set
-               barrier = ir3_FENCE(b);
-               barrier->cat7.g = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_IMAGE_W;
-               barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
-                               IR3_BARRIER_IMAGE_W;
-               break;
-       case nir_intrinsic_memory_barrier_shared:
-               barrier = ir3_FENCE(b);
-               barrier->cat7.g = true;
-               barrier->cat7.l = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_SHARED_W;
-               barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
-                               IR3_BARRIER_SHARED_W;
-               break;
-       case nir_intrinsic_group_memory_barrier:
-               barrier = ir3_FENCE(b);
-               barrier->cat7.g = true;
-               barrier->cat7.l = true;
-               barrier->cat7.r = true;
-               barrier->cat7.w = true;
-               barrier->barrier_class = IR3_BARRIER_SHARED_W |
-                               IR3_BARRIER_IMAGE_W |
-                               IR3_BARRIER_BUFFER_W;
-               barrier->barrier_conflict =
-                               IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
-                               IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
-                               IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
-               break;
-       default:
-               unreachable("boo");
-       }
-
-       /* make sure barrier doesn't get DCE'd */
-       array_insert(b, b->keeps, barrier);
-}
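-
-/* Roughly speaking, barrier_class marks what kind of access an instruction
- * performs and barrier_conflict marks what it must stay ordered against:
- * later passes keep two instructions ordered whenever one's
- * barrier_conflict intersects the other's barrier_class.
- */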
-
-static void
-add_sysval_input_compmask(struct ir3_context *ctx,
-               gl_system_value slot, unsigned compmask,
-               struct ir3_instruction *instr)
-{
-       struct ir3_shader_variant *so = ctx->so;
-       unsigned r = regid(so->inputs_count, 0);
-       unsigned n = so->inputs_count++;
-
-       so->inputs[n].sysval = true;
-       so->inputs[n].slot = slot;
-       so->inputs[n].compmask = compmask;
-       so->inputs[n].regid = r;
-       so->inputs[n].interpolate = INTERP_MODE_FLAT;
-       so->total_in++;
-
-       ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
-       ctx->ir->inputs[r] = instr;
-}
-
-static void
-add_sysval_input(struct ir3_context *ctx, gl_system_value slot,
-               struct ir3_instruction *instr)
-{
-       add_sysval_input_compmask(ctx, slot, 0x1, instr);
-}
-
-static void
-emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
-{
-       const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
-       struct ir3_instruction **dst;
-       struct ir3_instruction * const *src;
-       struct ir3_block *b = ctx->block;
-       nir_const_value *const_offset;
-       int idx, comp;
-
-       if (info->has_dest) {
-               unsigned n = nir_intrinsic_dest_components(intr);
-               dst = get_dst(ctx, &intr->dest, n);
-       } else {
-               dst = NULL;
-       }
-
-       switch (intr->intrinsic) {
-       case nir_intrinsic_load_uniform:
-               idx = nir_intrinsic_base(intr);
-               const_offset = nir_src_as_const_value(intr->src[0]);
-               if (const_offset) {
-                       idx += const_offset->u32[0];
-                       for (int i = 0; i < intr->num_components; i++) {
-                               unsigned n = idx * 4 + i;
-                               dst[i] = create_uniform(ctx, n);
-                       }
-               } else {
-                       src = get_src(ctx, &intr->src[0]);
-                       for (int i = 0; i < intr->num_components; i++) {
-                               int n = idx * 4 + i;
-                               dst[i] = create_uniform_indirect(ctx, n,
-                                               get_addr(ctx, src[0], 4));
-                       }
-                       /* NOTE: if relative addressing is used, we set
-                        * constlen in the compiler (to worst-case value)
-                        * since we don't know in the assembler what the max
-                        * addr reg value can be:
-                        */
-                       ctx->so->constlen = ctx->s->num_uniforms;
-               }
-               break;
-       case nir_intrinsic_load_ubo:
-               emit_intrinsic_load_ubo(ctx, intr, dst);
-               break;
-       case nir_intrinsic_load_input:
-               idx = nir_intrinsic_base(intr);
-               comp = nir_intrinsic_component(intr);
-               const_offset = nir_src_as_const_value(intr->src[0]);
-               if (const_offset) {
-                       idx += const_offset->u32[0];
-                       for (int i = 0; i < intr->num_components; i++) {
-                               unsigned n = idx * 4 + i + comp;
-                               dst[i] = ctx->ir->inputs[n];
-                       }
-               } else {
-                       src = get_src(ctx, &intr->src[0]);
-                       struct ir3_instruction *collect =
-                                       create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs);
-                       struct ir3_instruction *addr = get_addr(ctx, src[0], 4);
-                       for (int i = 0; i < intr->num_components; i++) {
-                               unsigned n = idx * 4 + i + comp;
-                               dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
-                                               n, addr, collect);
-                       }
-               }
-               break;
-       case nir_intrinsic_load_ssbo:
-               emit_intrinsic_load_ssbo(ctx, intr, dst);
-               break;
-       case nir_intrinsic_store_ssbo:
-               emit_intrinsic_store_ssbo(ctx, intr);
-               break;
-       case nir_intrinsic_get_buffer_size:
-               emit_intrinsic_ssbo_size(ctx, intr, dst);
-               break;
-       case nir_intrinsic_ssbo_atomic_add:
-       case nir_intrinsic_ssbo_atomic_imin:
-       case nir_intrinsic_ssbo_atomic_umin:
-       case nir_intrinsic_ssbo_atomic_imax:
-       case nir_intrinsic_ssbo_atomic_umax:
-       case nir_intrinsic_ssbo_atomic_and:
-       case nir_intrinsic_ssbo_atomic_or:
-       case nir_intrinsic_ssbo_atomic_xor:
-       case nir_intrinsic_ssbo_atomic_exchange:
-       case nir_intrinsic_ssbo_atomic_comp_swap:
-               dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);
-               break;
-       case nir_intrinsic_load_shared:
-               emit_intrinsic_load_shared(ctx, intr, dst);
-               break;
-       case nir_intrinsic_store_shared:
-               emit_intrinsic_store_shared(ctx, intr);
-               break;
-       case nir_intrinsic_shared_atomic_add:
-       case nir_intrinsic_shared_atomic_imin:
-       case nir_intrinsic_shared_atomic_umin:
-       case nir_intrinsic_shared_atomic_imax:
-       case nir_intrinsic_shared_atomic_umax:
-       case nir_intrinsic_shared_atomic_and:
-       case nir_intrinsic_shared_atomic_or:
-       case nir_intrinsic_shared_atomic_xor:
-       case nir_intrinsic_shared_atomic_exchange:
-       case nir_intrinsic_shared_atomic_comp_swap:
-               dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
-               break;
-       case nir_intrinsic_image_deref_load:
-               emit_intrinsic_load_image(ctx, intr, dst);
-               break;
-       case nir_intrinsic_image_deref_store:
-               emit_intrinsic_store_image(ctx, intr);
-               break;
-       case nir_intrinsic_image_deref_size:
-               emit_intrinsic_image_size(ctx, intr, dst);
-               break;
-       case nir_intrinsic_image_deref_atomic_add:
-       case nir_intrinsic_image_deref_atomic_min:
-       case nir_intrinsic_image_deref_atomic_max:
-       case nir_intrinsic_image_deref_atomic_and:
-       case nir_intrinsic_image_deref_atomic_or:
-       case nir_intrinsic_image_deref_atomic_xor:
-       case nir_intrinsic_image_deref_atomic_exchange:
-       case nir_intrinsic_image_deref_atomic_comp_swap:
-               dst[0] = emit_intrinsic_atomic_image(ctx, intr);
-               break;
-       case nir_intrinsic_barrier:
-       case nir_intrinsic_memory_barrier:
-       case nir_intrinsic_group_memory_barrier:
-       case nir_intrinsic_memory_barrier_atomic_counter:
-       case nir_intrinsic_memory_barrier_buffer:
-       case nir_intrinsic_memory_barrier_image:
-       case nir_intrinsic_memory_barrier_shared:
-               emit_intrinsic_barrier(ctx, intr);
-               /* note that the blk ptr is no longer valid, make that obvious: */
-               b = NULL;
-               break;
-       case nir_intrinsic_store_output:
-               idx = nir_intrinsic_base(intr);
-               comp = nir_intrinsic_component(intr);
-               const_offset = nir_src_as_const_value(intr->src[1]);
-               compile_assert(ctx, const_offset != NULL);
-               idx += const_offset->u32[0];
-
-               src = get_src(ctx, &intr->src[0]);
-               for (int i = 0; i < intr->num_components; i++) {
-                       unsigned n = idx * 4 + i + comp;
-                       ctx->ir->outputs[n] = src[i];
-               }
-               break;
-       case nir_intrinsic_load_base_vertex:
-       case nir_intrinsic_load_first_vertex:
-               if (!ctx->basevertex) {
-                       ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
-                       add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex);
-               }
-               dst[0] = ctx->basevertex;
-               break;
-       case nir_intrinsic_load_vertex_id_zero_base:
-       case nir_intrinsic_load_vertex_id:
-               if (!ctx->vertex_id) {
-                       gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
-                               SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
-                       ctx->vertex_id = create_input(ctx, 0);
-                       add_sysval_input(ctx, sv, ctx->vertex_id);
-               }
-               dst[0] = ctx->vertex_id;
-               break;
-       case nir_intrinsic_load_instance_id:
-               if (!ctx->instance_id) {
-                       ctx->instance_id = create_input(ctx, 0);
-                       add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
-                                       ctx->instance_id);
-               }
-               dst[0] = ctx->instance_id;
-               break;
-       case nir_intrinsic_load_sample_id:
-       case nir_intrinsic_load_sample_id_no_per_sample:
-               if (!ctx->samp_id) {
-                       ctx->samp_id = create_input(ctx, 0);
-                       ctx->samp_id->regs[0]->flags |= IR3_REG_HALF;
-                       add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID,
-                                       ctx->samp_id);
-               }
-               dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
-               break;
-       case nir_intrinsic_load_sample_mask_in:
-               if (!ctx->samp_mask_in) {
-                       ctx->samp_mask_in = create_input(ctx, 0);
-                       add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN,
-                                       ctx->samp_mask_in);
-               }
-               dst[0] = ctx->samp_mask_in;
-               break;
-       case nir_intrinsic_load_user_clip_plane:
-               idx = nir_intrinsic_ucp_id(intr);
-               for (int i = 0; i < intr->num_components; i++) {
-                       unsigned n = idx * 4 + i;
-                       dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
-               }
-               break;
-       case nir_intrinsic_load_front_face:
-               if (!ctx->frag_face) {
-                       ctx->so->frag_face = true;
-                       ctx->frag_face = create_input(ctx, 0);
-                       add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face);
-                       ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
-               }
-               /* for fragface, we get -1 for back and 0 for front. However this is
-                * the inverse of what nir expects (where ~0 is true).
-                */
-               dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
-               dst[0] = ir3_NOT_B(b, dst[0], 0);
-               break;
-       case nir_intrinsic_load_local_invocation_id:
-               if (!ctx->local_invocation_id) {
-                       ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7);
-                       add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID,
-                                       0x7, ctx->local_invocation_id);
-               }
-               split_dest(b, dst, ctx->local_invocation_id, 0, 3);
-               break;
-       case nir_intrinsic_load_work_group_id:
-               if (!ctx->work_group_id) {
-                       ctx->work_group_id = create_input_compmask(ctx, 0, 0x7);
-                       add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID,
-                                       0x7, ctx->work_group_id);
-                       ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH;
-               }
-               split_dest(b, dst, ctx->work_group_id, 0, 3);
-               break;
-       case nir_intrinsic_load_num_work_groups:
-               for (int i = 0; i < intr->num_components; i++) {
-                       dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
-               }
-               break;
-       case nir_intrinsic_load_local_group_size:
-               for (int i = 0; i < intr->num_components; i++) {
-                       dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
-               }
-               break;
-       case nir_intrinsic_discard_if:
-       case nir_intrinsic_discard: {
-               struct ir3_instruction *cond, *kill;
-
-               if (intr->intrinsic == nir_intrinsic_discard_if) {
-                       /* conditional discard: */
-                       src = get_src(ctx, &intr->src[0]);
-                       cond = ir3_b2n(b, src[0]);
-               } else {
-                       /* unconditional discard: */
-                       cond = create_immed(b, 1);
-               }
-
-               /* NOTE: only cmps.*.* can write p0.x: */
-               cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
-               cond->cat2.condition = IR3_COND_NE;
-
-               /* condition always goes in predicate register: */
-               cond->regs[0]->num = regid(REG_P0, 0);
-
-               kill = ir3_KILL(b, cond, 0);
-               array_insert(ctx->ir, ctx->ir->predicates, kill);
-
-               array_insert(b, b->keeps, kill);
-               ctx->so->has_kill = true;
-
-               break;
-       }
-       default:
-               compile_error(ctx, "Unhandled intrinsic type: %s\n",
-                               nir_intrinsic_infos[intr->intrinsic].name);
-               break;
-       }
-
-       if (info->has_dest)
-               put_dst(ctx, &intr->dest);
-}
-
-static void
-emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
-{
-       struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
-                       instr->def.num_components);
-       type_t type = (instr->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;
-
-       for (int i = 0; i < instr->def.num_components; i++)
-               dst[i] = create_immed_typed(ctx->block, instr->value.u32[i], type);
-}
-
-static void
-emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
-{
-       struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
-                       undef->def.num_components);
-       type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;
-
-       /* backend doesn't want undefined instructions, so just plug
-        * in 0.0..
-        */
-       for (int i = 0; i < undef->def.num_components; i++)
-               dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
-}
-
-/*
- * texture fetch/sample instructions:
- */
-
-static void
-tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
-{
-       unsigned coords, flags = 0;
-
-       /* note: we would use tex->coord_components, except for txs..  also,
-        * since the array index goes after the shadow ref, we don't want
-        * to count it:
-        */
-       switch (tex->sampler_dim) {
-       case GLSL_SAMPLER_DIM_1D:
-       case GLSL_SAMPLER_DIM_BUF:
-               coords = 1;
-               break;
-       case GLSL_SAMPLER_DIM_2D:
-       case GLSL_SAMPLER_DIM_RECT:
-       case GLSL_SAMPLER_DIM_EXTERNAL:
-       case GLSL_SAMPLER_DIM_MS:
-               coords = 2;
-               break;
-       case GLSL_SAMPLER_DIM_3D:
-       case GLSL_SAMPLER_DIM_CUBE:
-               coords = 3;
-               flags |= IR3_INSTR_3D;
-               break;
-       default:
-               unreachable("bad sampler_dim");
-       }
-
-       if (tex->is_shadow && tex->op != nir_texop_lod)
-               flags |= IR3_INSTR_S;
-
-       if (tex->is_array && tex->op != nir_texop_lod)
-               flags |= IR3_INSTR_A;
-
-       *flagsp = flags;
-       *coordsp = coords;
-}
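-
-/* For example, a sample from a 2D array shadow sampler comes out of
- * tex_info() with coords = 2 and flags = IR3_INSTR_S | IR3_INSTR_A,
- * while a (non-array) cube map gives coords = 3 and flags = IR3_INSTR_3D.
- */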
-
-static void
-emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
-       struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
-       struct ir3_instruction *lod, *compare, *proj, *sample_index;
-       bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
-       unsigned i, coords, flags;
-       unsigned nsrc0 = 0, nsrc1 = 0;
-       type_t type;
-       opc_t opc = 0;
-
-       coord = off = ddx = ddy = NULL;
-       lod = proj = compare = sample_index = NULL;
-
-       /* TODO: might just be one component for gathers? */
-       dst = get_dst(ctx, &tex->dest, 4);
-
-       for (unsigned i = 0; i < tex->num_srcs; i++) {
-               switch (tex->src[i].src_type) {
-               case nir_tex_src_coord:
-                       coord = get_src(ctx, &tex->src[i].src);
-                       break;
-               case nir_tex_src_bias:
-                       lod = get_src(ctx, &tex->src[i].src)[0];
-                       has_bias = true;
-                       break;
-               case nir_tex_src_lod:
-                       lod = get_src(ctx, &tex->src[i].src)[0];
-                       has_lod = true;
-                       break;
-               case nir_tex_src_comparator: /* shadow comparator */
-                       compare = get_src(ctx, &tex->src[i].src)[0];
-                       break;
-               case nir_tex_src_projector:
-                       proj = get_src(ctx, &tex->src[i].src)[0];
-                       has_proj = true;
-                       break;
-               case nir_tex_src_offset:
-                       off = get_src(ctx, &tex->src[i].src);
-                       has_off = true;
-                       break;
-               case nir_tex_src_ddx:
-                       ddx = get_src(ctx, &tex->src[i].src);
-                       break;
-               case nir_tex_src_ddy:
-                       ddy = get_src(ctx, &tex->src[i].src);
-                       break;
-               case nir_tex_src_ms_index:
-                       sample_index = get_src(ctx, &tex->src[i].src)[0];
-                       break;
-               default:
-                       compile_error(ctx, "Unhandled NIR tex src type: %d\n",
-                                       tex->src[i].src_type);
-                       return;
-               }
-       }
-
-       switch (tex->op) {
-       case nir_texop_tex:      opc = has_lod ? OPC_SAML : OPC_SAM; break;
-       case nir_texop_txb:      opc = OPC_SAMB;     break;
-       case nir_texop_txl:      opc = OPC_SAML;     break;
-       case nir_texop_txd:      opc = OPC_SAMGQ;    break;
-       case nir_texop_txf:      opc = OPC_ISAML;    break;
-       case nir_texop_lod:      opc = OPC_GETLOD;   break;
-       case nir_texop_tg4:
-               /* NOTE: a4xx might need to emulate gather w/ txf (this is
-                * what the blob does; it seems gather is broken?), and a3xx
-                * did not support it (but probably could also emulate it).
-                */
-               switch (tex->component) {
-               case 0:              opc = OPC_GATHER4R; break;
-               case 1:              opc = OPC_GATHER4G; break;
-               case 2:              opc = OPC_GATHER4B; break;
-               case 3:              opc = OPC_GATHER4A; break;
-               }
-               break;
-       case nir_texop_txf_ms:   opc = OPC_ISAMM;    break;
-       case nir_texop_txs:
-       case nir_texop_query_levels:
-       case nir_texop_texture_samples:
-       case nir_texop_samples_identical:
-       case nir_texop_txf_ms_mcs:
-               compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
-               return;
-       }
-
-       tex_info(tex, &flags, &coords);
-
-       /*
-        * lay out the first argument in the proper order:
-        *  - actual coordinates first
-        *  - shadow reference
-        *  - array index
-        *  - projection w
-        *  - starting at offset 4, dpdx.xy, dpdy.xy
-        *
-        * bias/lod go into the second arg
-        */
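-
-       /* e.g. a sample from a 2D array shadow sampler packs src0 as
-        *   [ x, y, compare, array-idx ]
-        * following the layout described above:
-        */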
-
-       /* insert tex coords: */
-       for (i = 0; i < coords; i++)
-               src0[i] = coord[i];
-
-       nsrc0 = i;
-
-       /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
-        * with scaled x coord according to requested sample:
-        */
-       if (tex->op == nir_texop_txf_ms) {
-               if (ctx->compiler->txf_ms_with_isaml) {
-                       /* the samples are laid out in x dimension as
-                        *     0 1 2 3
-                        * x_ms = (x << ms) + sample_index;
-                        */
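-                       /* (for example, with 4x msaa ms = 2, so sample 3
-                        * of the texel at x = 5 lands at (5 << 2) + 3 = 23)
-                        */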
-                       struct ir3_instruction *ms;
-                       ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
-
-                       src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
-                       src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
-
-                       opc = OPC_ISAML;
-               } else {
-                       src0[nsrc0++] = sample_index;
-               }
-       }
-
-       /* scale up integer coords for TXF based on the LOD */
-       if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
-               assert(has_lod);
-               for (i = 0; i < coords; i++)
-                       src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
-       }
-
-       if (coords == 1) {
-               /* hw doesn't do 1d, so we treat it as 2d with
-                * height of 1, and patch up the y coord.
-                * TODO: y coord should be (int)0 in some cases..
-                */
-               src0[nsrc0++] = create_immed(b, fui(0.5));
-       }
-
-       if (tex->is_shadow && tex->op != nir_texop_lod)
-               src0[nsrc0++] = compare;
-
-       if (tex->is_array && tex->op != nir_texop_lod) {
-               struct ir3_instruction *idx = coord[coords];
-
-               /* the array coord for cube arrays needs 0.5 added to it */
-               if (ctx->compiler->array_index_add_half && (opc != OPC_ISAML))
-                       idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
-
-               src0[nsrc0++] = idx;
-       }
-
-       if (has_proj) {
-               src0[nsrc0++] = proj;
-               flags |= IR3_INSTR_P;
-       }
-
-       /* pad to 4, then ddx/ddy: */
-       if (tex->op == nir_texop_txd) {
-               while (nsrc0 < 4)
-                       src0[nsrc0++] = create_immed(b, fui(0.0));
-               for (i = 0; i < coords; i++)
-                       src0[nsrc0++] = ddx[i];
-               if (coords < 2)
-                       src0[nsrc0++] = create_immed(b, fui(0.0));
-               for (i = 0; i < coords; i++)
-                       src0[nsrc0++] = ddy[i];
-               if (coords < 2)
-                       src0[nsrc0++] = create_immed(b, fui(0.0));
-       }
-
-       /*
-        * second argument (if applicable):
-        *  - offsets
-        *  - lod
-        *  - bias
-        */
-       if (has_off | has_lod | has_bias) {
-               if (has_off) {
-                       unsigned off_coords = coords;
-                       if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
-                               off_coords--;
-                       for (i = 0; i < off_coords; i++)
-                               src1[nsrc1++] = off[i];
-                       if (off_coords < 2)
-                               src1[nsrc1++] = create_immed(b, fui(0.0));
-                       flags |= IR3_INSTR_O;
-               }
-
-               if (has_lod | has_bias)
-                       src1[nsrc1++] = lod;
-       }
-
-       switch (tex->dest_type) {
-       case nir_type_invalid:
-       case nir_type_float:
-               type = TYPE_F32;
-               break;
-       case nir_type_int:
-               type = TYPE_S32;
-               break;
-       case nir_type_uint:
-       case nir_type_bool:
-               type = TYPE_U32;
-               break;
-       default:
-               unreachable("bad dest_type");
-       }
-
-       if (opc == OPC_GETLOD)
-               type = TYPE_U32;
-
-       unsigned tex_idx = tex->texture_index;
-
-       ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx);
-
-       struct ir3_instruction *col0 = create_collect(ctx, src0, nsrc0);
-       struct ir3_instruction *col1 = create_collect(ctx, src1, nsrc1);
-
-       sam = ir3_SAM(b, opc, type, 0b1111, flags,
-                       tex_idx, tex_idx, col0, col1);
-
-       if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) {
-               /* only need first 3 components: */
-               sam->regs[0]->wrmask = 0x7;
-               split_dest(b, dst, sam, 0, 3);
-
-               /* we need to sample the alpha separately with a non-ASTC
-                * texture state:
-                */
-               sam = ir3_SAM(b, opc, type, 0b1000, flags,
-                               tex_idx, tex_idx, col0, col1);
-
-               array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
-
-               /* fixup .w component: */
-               split_dest(b, &dst[3], sam, 3, 1);
-       } else {
-               /* normal (non-workaround) case: */
-               split_dest(b, dst, sam, 0, 4);
-       }
-
-       /* GETLOD returns results in 4.8 fixed point */
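-       /* (a raw value of 0x180 is 384/256 = 1.5; the multiply by 1.0/256
-        * below converts from 4.8 fixed point to float)
-        */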
-       if (opc == OPC_GETLOD) {
-               struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
-
-               compile_assert(ctx, tex->dest_type == nir_type_float);
-               for (i = 0; i < 2; i++) {
-                       dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
-                                                          factor, 0);
-               }
-       }
-
-       put_dst(ctx, &tex->dest);
-}
-
-static void
-emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction **dst, *sam;
-
-       dst = get_dst(ctx, &tex->dest, 1);
-
-       sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, 0b0100, 0,
-                       tex->texture_index, tex->texture_index, NULL, NULL);
-
-       /* even though there is only one component, since it ends
-        * up in .z rather than .x, we need a split_dest()
-        */
-       split_dest(b, dst, sam, 0, 3);
-
-       /* The # of levels comes from getinfo.z. We need to add 1 to it, since
-        * the value in TEX_CONST_0 is zero-based.
-        */
-       if (ctx->compiler->levels_add_one)
-               dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
-
-       put_dst(ctx, &tex->dest);
-}
-
-static void
-emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
-{
-       struct ir3_block *b = ctx->block;
-       struct ir3_instruction **dst, *sam;
-       struct ir3_instruction *lod;
-       unsigned flags, coords;
-
-       tex_info(tex, &flags, &coords);
-
-       /* Actually we want the number of dimensions, not coordinates. This
-        * distinction only matters for cubes.
-        */
-       if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
-               coords = 2;
-
-       dst = get_dst(ctx, &tex->dest, 4);
-
-       compile_assert(ctx, tex->num_srcs == 1);
-       compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
-
-       lod = get_src(ctx, &tex->src[0].src)[0];
-
-       sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
-                       tex->texture_index, tex->texture_index, lod, NULL);
-
-       split_dest(b, dst, sam, 0, 4);
-
-       /* Array size actually ends up in .w rather than .z. This doesn't
-        * matter for miplevel 0, but for higher mips the value in z is
-        * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
-        * returned, which means that we have to add 1 to it for arrays.
-        */
-       if (tex->is_array) {
-               if (ctx->compiler->levels_add_one) {
-                       dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
-               } else {
-                       dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
-               }
-       }
-
-       put_dst(ctx, &tex->dest);
-}
-
-static void
-emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
-{
-       switch (jump->type) {
-       case nir_jump_break:
-       case nir_jump_continue:
-       case nir_jump_return:
-               /* I *think* we can simply ignore this, and use the
-                * successor block link to figure out where we need to
-                * jump to for break/continue:
-                */
-               break;
-       default:
-               compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
-               break;
-       }
-}
-
-static void
-emit_instr(struct ir3_context *ctx, nir_instr *instr)
-{
-       switch (instr->type) {
-       case nir_instr_type_alu:
-               emit_alu(ctx, nir_instr_as_alu(instr));
-               break;
-       case nir_instr_type_deref:
-               /* ignored, handled as part of the intrinsic they are src to */
-               break;
-       case nir_instr_type_intrinsic:
-               emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
-               break;
-       case nir_instr_type_load_const:
-               emit_load_const(ctx, nir_instr_as_load_const(instr));
-               break;
-       case nir_instr_type_ssa_undef:
-               emit_undef(ctx, nir_instr_as_ssa_undef(instr));
-               break;
-       case nir_instr_type_tex: {
-               nir_tex_instr *tex = nir_instr_as_tex(instr);
-               /* a couple of tex instructions get special-cased:
-                */
-               switch (tex->op) {
-               case nir_texop_txs:
-                       emit_tex_txs(ctx, tex);
-                       break;
-               case nir_texop_query_levels:
-                       emit_tex_query_levels(ctx, tex);
-                       break;
-               default:
-                       emit_tex(ctx, tex);
-                       break;
-               }
-               break;
-       }
-       case nir_instr_type_jump:
-               emit_jump(ctx, nir_instr_as_jump(instr));
-               break;
-       case nir_instr_type_phi:
-               /* we have converted phi webs to regs in NIR by now */
-               compile_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type);
-               break;
-       case nir_instr_type_call:
-       case nir_instr_type_parallel_copy:
-               compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
-               break;
-       }
-}
-
-static struct ir3_block *
-get_block(struct ir3_context *ctx, const nir_block *nblock)
-{
-       struct ir3_block *block;
-       struct hash_entry *hentry;
-       unsigned i;
-
-       hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
-       if (hentry)
-               return hentry->data;
-
-       block = ir3_block_create(ctx->ir);
-       block->nblock = nblock;
-       _mesa_hash_table_insert(ctx->block_ht, nblock, block);
-
-       block->predecessors_count = nblock->predecessors->entries;
-       block->predecessors = ralloc_array_size(block,
-               sizeof(block->predecessors[0]), block->predecessors_count);
-       i = 0;
-       set_foreach(nblock->predecessors, sentry) {
-               block->predecessors[i++] = get_block(ctx, sentry->key);
-       }
-
-       return block;
-}
-
-static void
-emit_block(struct ir3_context *ctx, nir_block *nblock)
-{
-       struct ir3_block *block = get_block(ctx, nblock);
-
-       for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
-               if (nblock->successors[i]) {
-                       block->successors[i] =
-                               get_block(ctx, nblock->successors[i]);
-               }
-       }
-
-       ctx->block = block;
-       list_addtail(&block->node, &ctx->ir->block_list);
-
-       /* re-emit addr register in each block if needed: */
-       for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) {
-               _mesa_hash_table_destroy(ctx->addr_ht[i], NULL);
-               ctx->addr_ht[i] = NULL;
-       }
-
-       nir_foreach_instr(instr, nblock) {
-               ctx->cur_instr = instr;
-               emit_instr(ctx, instr);
-               ctx->cur_instr = NULL;
-               if (ctx->error)
-                       return;
-       }
-}
-
-static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
-
-static void
-emit_if(struct ir3_context *ctx, nir_if *nif)
-{
-       struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
-
-       ctx->block->condition =
-               get_predicate(ctx, ir3_b2n(condition->block, condition));
-
-       emit_cf_list(ctx, &nif->then_list);
-       emit_cf_list(ctx, &nif->else_list);
-}
-
-static void
-emit_loop(struct ir3_context *ctx, nir_loop *nloop)
-{
-       emit_cf_list(ctx, &nloop->body);
-}
-
-static void
-emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
-{
-       foreach_list_typed(nir_cf_node, node, node, list) {
-               switch (node->type) {
-               case nir_cf_node_block:
-                       emit_block(ctx, nir_cf_node_as_block(node));
-                       break;
-               case nir_cf_node_if:
-                       emit_if(ctx, nir_cf_node_as_if(node));
-                       break;
-               case nir_cf_node_loop:
-                       emit_loop(ctx, nir_cf_node_as_loop(node));
-                       break;
-               case nir_cf_node_function:
-                       compile_error(ctx, "TODO\n");
-                       break;
-               }
-       }
-}
-
-/* emit stream-out code.  At this point, the current block is the original
- * (nir) end block, and nir ensures that all flow control paths flow
- * into the end block.  We re-purpose the original end block to generate
- * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
- * block holding stream-out write instructions, followed by the new end
- * block:
- *
- *   blockOrigEnd {
- *      p0.x = (vtxcnt < maxvtxcnt)
- *      // succs: blockStreamOut, blockNewEnd
- *   }
- *   blockStreamOut {
- *      ... stream-out instructions ...
- *      // succs: blockNewEnd
- *   }
- *   blockNewEnd {
- *   }
- */
-static void
-emit_stream_out(struct ir3_context *ctx)
-{
-       struct ir3_shader_variant *v = ctx->so;
-       struct ir3 *ir = ctx->ir;
-       struct ir3_stream_output_info *strmout =
-                       &ctx->so->shader->stream_output;
-       struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
-       struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
-       struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
-
-       /* create vtxcnt input in input block at top of shader,
-        * so that it is seen as live over the entire duration
-        * of the shader:
-        */
-       vtxcnt = create_input(ctx, 0);
-       add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);
-
-       maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
-
-       /* at this point, we are at the original 'end' block,
-        * re-purpose this block to stream-out condition, then
-        * append stream-out block and new-end block
-        */
-       orig_end_block = ctx->block;
-
-// TODO these blocks need to update predecessors..
-// maybe w/ store_global intrinsic, we could do this
-// stuff in nir->nir pass
-
-       stream_out_block = ir3_block_create(ir);
-       list_addtail(&stream_out_block->node, &ir->block_list);
-
-       new_end_block = ir3_block_create(ir);
-       list_addtail(&new_end_block->node, &ir->block_list);
-
-       orig_end_block->successors[0] = stream_out_block;
-       orig_end_block->successors[1] = new_end_block;
-       stream_out_block->successors[0] = new_end_block;
-
-       /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
-       cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
-       cond->regs[0]->num = regid(REG_P0, 0);
-       cond->cat2.condition = IR3_COND_LT;
-
-       /* the condition goes on the block preceding the conditional,
-        * since it is used to pick which of the two successor
-        * paths to take:
-        */
-       orig_end_block->condition = cond;
-
-       /* switch to stream_out_block to generate the stream-out
-        * instructions:
-        */
-       ctx->block = stream_out_block;
-
-       /* Calculate base addresses based on vtxcnt.  Instructions
-        * generated for bases not used in the following loop will be
-        * stripped out in the backend.
-        */
-       for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
-               unsigned stride = strmout->stride[i];
-               struct ir3_instruction *base, *off;
-
-               base = create_uniform(ctx, regid(v->constbase.tfbo, i));
-
-               /* 24-bit should be enough: */
-               off = ir3_MUL_U(ctx->block, vtxcnt, 0,
-                               create_immed(ctx->block, stride * 4), 0);
-
-               bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
-       }
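-       /* ie. assuming stride[] is in dwords, this computes roughly:
-        *
-        *    bases[i] = tfbo_const[i] + (vtxcnt * stride[i] * 4)
-        *
-        * a byte offset for the current vertex in buffer i.
-        */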
-
-       /* Generate the per-output store instructions: */
-       for (unsigned i = 0; i < strmout->num_outputs; i++) {
-               for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
-                       unsigned c = j + strmout->output[i].start_component;
-                       struct ir3_instruction *base, *out, *stg;
-
-                       base = bases[strmout->output[i].output_buffer];
-                       out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
-
-                       stg = ir3_STG(ctx->block, base, 0, out, 0,
-                                       create_immed(ctx->block, 1), 0);
-                       stg->cat6.type = TYPE_U32;
-                       stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
-
-                       array_insert(ctx->block, ctx->block->keeps, stg);
-               }
-       }
-
-       /* and finally switch to the new_end_block: */
-       ctx->block = new_end_block;
-}
-
-static void
-emit_function(struct ir3_context *ctx, nir_function_impl *impl)
-{
-       nir_metadata_require(impl, nir_metadata_block_index);
-
-       emit_cf_list(ctx, &impl->body);
-       emit_block(ctx, impl->end_block);
-
-       /* at this point, we should have a single empty block,
-        * into which we emit the 'end' instruction.
-        */
-       compile_assert(ctx, list_empty(&ctx->block->instr_list));
-
-       /* If stream-out (aka transform-feedback) enabled, emit the
-        * stream-out instructions, followed by a new empty block (into
-        * which the 'end' instruction lands).
-        *
-        * NOTE: it is done in this order, rather than inserting before
-        * we emit end_block, because NIR guarantees that all blocks
-        * flow into end_block, and that end_block has no successors.
-        * So by re-purposing end_block as the first block of stream-
-        * out, we guarantee that all exit paths flow into the stream-
-        * out instructions.
-        */
-       if ((ctx->compiler->gpu_id < 500) &&
-                       (ctx->so->shader->stream_output.num_outputs > 0) &&
-                       !ctx->so->binning_pass) {
-               debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
-               emit_stream_out(ctx);
-       }
-
-       ir3_END(ctx->block);
-}
-
-static struct ir3_instruction *
-create_frag_coord(struct ir3_context *ctx, unsigned comp)
-{
-       struct ir3_block *block = ctx->block;
-       struct ir3_instruction *instr;
-
-       if (!ctx->frag_coord) {
-               ctx->frag_coord = create_input_compmask(ctx, 0, 0xf);
-               /* defer add_sysval_input() until after all inputs created */
-       }
-
-       split_dest(block, &instr, ctx->frag_coord, comp, 1);
-
-       switch (comp) {
-       case 0: /* .x */
-       case 1: /* .y */
-               /* for frag_coord, we get unsigned values.. we need
-                * to subtract (integer) 8 and divide by 16 (right-
-                * shift by 4) then convert to float:
-                *
-                *    sub.s tmp, src, 8
-                *    shr.b tmp, tmp, 4
-                *    mov.u32f32 dst, tmp
-                *
-                */
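-               /* for example, assuming raw coords in 1/16th pixel units
-                * with a half-pixel (+8) offset: the center of pixel x=1
-                * arrives as 16*1 + 8 == 24, and (24 - 8) >> 4 == 1:
-                */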
-               instr = ir3_SUB_S(block, instr, 0,
-                               create_immed(block, 8), 0);
-               instr = ir3_SHR_B(block, instr, 0,
-                               create_immed(block, 4), 0);
-               instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
-
-               return instr;
-       case 2: /* .z */
-       case 3: /* .w */
-       default:
-               /* seems that we can use these as-is: */
-               return instr;
-       }
-}
-
-static void
-setup_input(struct ir3_context *ctx, nir_variable *in)
-{
-       struct ir3_shader_variant *so = ctx->so;
-       unsigned ncomp = glsl_get_components(in->type);
-       unsigned n = in->data.driver_location;
-       unsigned slot = in->data.location;
-
-       /* let's pretend things other than vec4 don't exist: */
-       ncomp = MAX2(ncomp, 4);
-
-       /* skip unread inputs: we could end up with, for example, unsplit
-        * matrix/etc inputs when they are not read, so just silently
-        * skip these.
-        */
-       if (ncomp > 4)
-               return;
-
-       compile_assert(ctx, ncomp == 4);
-
-       so->inputs[n].slot = slot;
-       so->inputs[n].compmask = (1 << ncomp) - 1;
-       so->inputs_count = MAX2(so->inputs_count, n + 1);
-       so->inputs[n].interpolate = in->data.interpolation;
-
-       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-               for (int i = 0; i < ncomp; i++) {
-                       struct ir3_instruction *instr = NULL;
-                       unsigned idx = (n * 4) + i;
-
-                       if (slot == VARYING_SLOT_POS) {
-                               so->inputs[n].bary = false;
-                               so->frag_coord = true;
-                               instr = create_frag_coord(ctx, i);
-                       } else if (slot == VARYING_SLOT_PNTC) {
-                               /* see for example st_nir_fixup_varying_slots().. this is
-                                * maybe a bit mesa/st specific.  But we need things to line
-                                * up for this in fdN_program:
-                                *    unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
-                                *    if (emit->sprite_coord_enable & texmask) {
-                                *       ...
-                                *    }
-                                */
-                               so->inputs[n].slot = VARYING_SLOT_VAR8;
-                               so->inputs[n].bary = true;
-                               instr = create_frag_input(ctx, false);
-                       } else {
-                               bool use_ldlv = false;
-
-                               /* detect the special case for front/back colors where
-                                * we need to do flat vs smooth shading depending on
-                                * rast state:
-                                */
-                               if (in->data.interpolation == INTERP_MODE_NONE) {
-                                       switch (slot) {
-                                       case VARYING_SLOT_COL0:
-                                       case VARYING_SLOT_COL1:
-                                       case VARYING_SLOT_BFC0:
-                                       case VARYING_SLOT_BFC1:
-                                               so->inputs[n].rasterflat = true;
-                                               break;
-                                       default:
-                                               break;
-                                       }
-                               }
-
-                               if (ctx->compiler->flat_bypass) {
-                                       if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
-                                                       (so->inputs[n].rasterflat && ctx->so->key.rasterflat))
-                                               use_ldlv = true;
-                               }
-
-                               so->inputs[n].bary = true;
-
-                               instr = create_frag_input(ctx, use_ldlv);
-                       }
-
-                       compile_assert(ctx, idx < ctx->ir->ninputs);
-
-                       ctx->ir->inputs[idx] = instr;
-               }
-       } else if (ctx->so->type == MESA_SHADER_VERTEX) {
-               for (int i = 0; i < ncomp; i++) {
-                       unsigned idx = (n * 4) + i;
-                       compile_assert(ctx, idx < ctx->ir->ninputs);
-                       ctx->ir->inputs[idx] = create_input(ctx, idx);
-               }
-       } else {
-               compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
-       }
-
-       if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) {
-               so->total_in += ncomp;
-       }
-}
-
-static void
-setup_output(struct ir3_context *ctx, nir_variable *out)
-{
-       struct ir3_shader_variant *so = ctx->so;
-       unsigned ncomp = glsl_get_components(out->type);
-       unsigned n = out->data.driver_location;
-       unsigned slot = out->data.location;
-       unsigned comp = 0;
-
-       /* let's pretend things other than vec4 don't exist: */
-       ncomp = MAX2(ncomp, 4);
-       compile_assert(ctx, ncomp == 4);
-
-       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-               switch (slot) {
-               case FRAG_RESULT_DEPTH:
-                       comp = 2;  /* tgsi will write to .z component */
-                       so->writes_pos = true;
-                       break;
-               case FRAG_RESULT_COLOR:
-                       so->color0_mrt = 1;
-                       break;
-               default:
-                       if (slot >= FRAG_RESULT_DATA0)
-                               break;
-                       compile_error(ctx, "unknown FS output name: %s\n",
-                                       gl_frag_result_name(slot));
-               }
-       } else if (ctx->so->type == MESA_SHADER_VERTEX) {
-               switch (slot) {
-               case VARYING_SLOT_POS:
-                       so->writes_pos = true;
-                       break;
-               case VARYING_SLOT_PSIZ:
-                       so->writes_psize = true;
-                       break;
-               case VARYING_SLOT_COL0:
-               case VARYING_SLOT_COL1:
-               case VARYING_SLOT_BFC0:
-               case VARYING_SLOT_BFC1:
-               case VARYING_SLOT_FOGC:
-               case VARYING_SLOT_CLIP_DIST0:
-               case VARYING_SLOT_CLIP_DIST1:
-               case VARYING_SLOT_CLIP_VERTEX:
-                       break;
-               default:
-                       if (slot >= VARYING_SLOT_VAR0)
-                               break;
-                       if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
-                               break;
-                       compile_error(ctx, "unknown VS output name: %s\n",
-                                       gl_varying_slot_name(slot));
-               }
-       } else {
-               compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
-       }
-
-       compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
-
-       so->outputs[n].slot = slot;
-       so->outputs[n].regid = regid(n, comp);
-       so->outputs_count = MAX2(so->outputs_count, n + 1);
-
-       for (int i = 0; i < ncomp; i++) {
-               unsigned idx = (n * 4) + i;
-               compile_assert(ctx, idx < ctx->ir->noutputs);
-               ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
-       }
-}
-
-static int
-max_drvloc(struct exec_list *vars)
-{
-       int drvloc = -1;
-       nir_foreach_variable(var, vars) {
-               drvloc = MAX2(drvloc, (int)var->data.driver_location);
-       }
-       return drvloc;
-}
-
-static const unsigned max_sysvals[] = {
-       [MESA_SHADER_FRAGMENT] = 24,  // TODO
-       [MESA_SHADER_VERTEX]  = 16,
-       [MESA_SHADER_COMPUTE] = 16, // TODO how many do we actually need?
-};
-
-static void
-emit_instructions(struct ir3_context *ctx)
-{
-       unsigned ninputs, noutputs;
-       nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
-
-       ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
-       noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
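-       /* eg. with vec4 inputs at driver_location 0 and 1, max_drvloc
-        * is 1, so ninputs is (1 + 1) * 4 == 8 scalar slots:
-        */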
-
-       /* we need to leave room for sysvals:
-        */
-       ninputs += max_sysvals[ctx->so->type];
-
-       ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
-
-       /* Create inputs in first block: */
-       ctx->block = get_block(ctx, nir_start_block(fxn));
-       ctx->in_block = ctx->block;
-       list_addtail(&ctx->block->node, &ctx->ir->block_list);
-
-       ninputs -= max_sysvals[ctx->so->type];
-
-       /* for fragment shader, the vcoord input register is used as the
-        * base for bary.f varying fetch instrs:
-        */
-       struct ir3_instruction *vcoord = NULL;
-       if (ctx->so->type == MESA_SHADER_FRAGMENT) {
-               struct ir3_instruction *xy[2];
-
-               vcoord = create_input_compmask(ctx, 0, 0x3);
-               split_dest(ctx->block, xy, vcoord, 0, 2);
-
-               ctx->frag_vcoord = create_collect(ctx, xy, 2);
-       }
-
-       /* Setup inputs: */
-       nir_foreach_variable(var, &ctx->s->inputs) {
-               setup_input(ctx, var);
-       }
-
-       /* Defer add_sysval_input() stuff until after setup_inputs(),
-        * because sysvals need to be appended after varyings:
-        */
-       if (vcoord) {
-               add_sysval_input_compmask(ctx, SYSTEM_VALUE_VARYING_COORD,
-                               0x3, vcoord);
-       }
-
-       if (ctx->frag_coord) {
-               add_sysval_input_compmask(ctx, SYSTEM_VALUE_FRAG_COORD,
-                               0xf, ctx->frag_coord);
-       }
-
-       /* Setup outputs: */
-       nir_foreach_variable(var, &ctx->s->outputs) {
-               setup_output(ctx, var);
-       }
-
-       /* Setup registers (which should only be arrays): */
-       nir_foreach_register(reg, &ctx->s->registers) {
-               declare_array(ctx, reg);
-       }
-
-       /* NOTE: need to do something more clever when we support >1 fxn */
-       nir_foreach_register(reg, &fxn->registers) {
-               declare_array(ctx, reg);
-       }
-       /* And emit the body: */
-       ctx->impl = fxn;
-       emit_function(ctx, fxn);
-}
-
-/* from the NIR perspective, we actually have varying inputs.  But the varying
- * inputs, from an IR standpoint, are just bary.f/ldlv instructions.  The
- * only actual inputs are the sysvals.
- */
-static void
-fixup_frag_inputs(struct ir3_context *ctx)
-{
-       struct ir3_shader_variant *so = ctx->so;
-       struct ir3 *ir = ctx->ir;
-       unsigned i = 0;
-
-       /* sysvals should appear at the end of the inputs, drop everything else: */
-       while ((i < so->inputs_count) && !so->inputs[i].sysval)
-               i++;
-
-       /* at IR level, inputs are always blocks of 4 scalars: */
-       i *= 4;
-
-       ir->inputs = &ir->inputs[i];
-       ir->ninputs -= i;
-}
-
-/* Fixup tex sampler state for astc/srgb workaround instructions.  We
- * need to assign the tex state indexes for these after we know the
- * max tex index.
- */
-static void
-fixup_astc_srgb(struct ir3_context *ctx)
-{
-       struct ir3_shader_variant *so = ctx->so;
-       /* indexed by original tex idx, value is newly assigned alpha sampler
-        * state tex idx.  Zero is invalid since there is at least one sampler
-        * if we get here.
-        */
-       unsigned alt_tex_state[16] = {0};
-       unsigned tex_idx = ctx->max_texture_index + 1;
-       unsigned idx = 0;
-
-       so->astc_srgb.base = tex_idx;
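-       /* eg. if max_texture_index is 3 and samplers #0 and #2 need the
-        * workaround, they are assigned (in order of first use) alternate
-        * tex state slots 4 and 5, with astc_srgb.base == 4:
-        */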
-
-       for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
-               struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
-
-               compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
-
-               if (alt_tex_state[sam->cat5.tex] == 0) {
-                       /* assign new alternate/alpha tex state slot: */
-                       alt_tex_state[sam->cat5.tex] = tex_idx++;
-                       so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
-                       so->astc_srgb.count++;
-               }
-
-               sam->cat5.tex = alt_tex_state[sam->cat5.tex];
-       }
-}
-
-static void
-fixup_binning_pass(struct ir3_context *ctx)
-{
-       struct ir3_shader_variant *so = ctx->so;
-       struct ir3 *ir = ctx->ir;
-       unsigned i, j;
-
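-       /* eg. outputs [POS, COL0, PSIZ] are compacted to [POS, PSIZ],
-        * along with the corresponding blocks of 4 scalar ir outputs:
-        */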
-       for (i = 0, j = 0; i < so->outputs_count; i++) {
-               unsigned slot = so->outputs[i].slot;
-
-               /* throw away everything but first position/psize */
-               if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
-                       if (i != j) {
-                               so->outputs[j] = so->outputs[i];
-                               ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
-                               ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
-                               ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
-                               ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
-                       }
-                       j++;
-               }
-       }
-       so->outputs_count = j;
-       ir->noutputs = j * 4;
-}
-
-int
-ir3_compile_shader_nir(struct ir3_compiler *compiler,
-               struct ir3_shader_variant *so)
-{
-       struct ir3_context *ctx;
-       struct ir3 *ir;
-       struct ir3_instruction **inputs;
-       unsigned i, actual_in, inloc;
-       int ret = 0, max_bary;
-
-       assert(!so->ir);
-
-       ctx = compile_init(compiler, so);
-       if (!ctx) {
-               DBG("INIT failed!");
-               ret = -1;
-               goto out;
-       }
-
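-       /* rough pipeline from here: emit nir->ir3, copy-propagate,
-        * group neighbors, compute depth, schedule, register-allocate,
-        * and finally legalize (sync flags, etc):
-        */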
-       emit_instructions(ctx);
-
-       if (ctx->error) {
-               DBG("EMIT failed!");
-               ret = -1;
-               goto out;
-       }
-
-       ir = so->ir = ctx->ir;
-
-       /* keep track of the inputs from the TGSI perspective: */
-       inputs = ir->inputs;
-
-       /* but fixup actual inputs for frag shader: */
-       if (so->type == MESA_SHADER_FRAGMENT)
-               fixup_frag_inputs(ctx);
-
-       /* at this point, for binning pass, throw away unneeded outputs: */
-       if (so->binning_pass && (ctx->compiler->gpu_id < 600))
-               fixup_binning_pass(ctx);
-
-       /* if we want half-precision outputs, mark the output registers
-        * as half:
-        */
-       if (so->key.half_precision) {
-               for (i = 0; i < ir->noutputs; i++) {
-                       struct ir3_instruction *out = ir->outputs[i];
-
-                       if (!out)
-                               continue;
-
-                       /* if frag shader writes z, that needs to be full precision: */
-                       if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH)
-                               continue;
-
-                       out->regs[0]->flags |= IR3_REG_HALF;
-                       /* output could be a fanout (ie. texture fetch output)
-                        * in which case we need to propagate the half-reg flag
-                        * up to the definer so that RA sees it:
-                        */
-                       if (out->opc == OPC_META_FO) {
-                               out = out->regs[1]->instr;
-                               out->regs[0]->flags |= IR3_REG_HALF;
-                       }
-
-                       if (out->opc == OPC_MOV) {
-                               out->cat1.dst_type = half_type(out->cat1.dst_type);
-                       }
-               }
-       }
-
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               printf("BEFORE CP:\n");
-               ir3_print(ir);
-       }
-
-       ir3_cp(ir, so);
-
-       /* at this point, for binning pass, throw away unneeded outputs:
-        * Note that for a6xx and later, we do this after ir3_cp to ensure
-        * that the uniform/constant layout for BS and VS matches, so that
-        * we can re-use same VS_CONST state group.
-        */
-       if (so->binning_pass && (ctx->compiler->gpu_id >= 600))
-               fixup_binning_pass(ctx);
-
-       /* Insert a mov if the same instruction is used for more than one
-        * output, so that each output has its own distinct instruction.
-        * eg. dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
-        */
-       for (int i = ir->noutputs - 1; i >= 0; i--) {
-               if (!ir->outputs[i])
-                       continue;
-               for (unsigned j = 0; j < i; j++) {
-                       if (ir->outputs[i] == ir->outputs[j]) {
-                               ir->outputs[i] =
-                                       ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32);
-                       }
-               }
-       }
-
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               printf("BEFORE GROUPING:\n");
-               ir3_print(ir);
-       }
-
-       ir3_sched_add_deps(ir);
-
-       /* Group left/right neighbors, inserting mov's where needed to
-        * solve conflicts:
-        */
-       ir3_group(ir);
-
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               printf("AFTER GROUPING:\n");
-               ir3_print(ir);
-       }
-
-       ir3_depth(ir);
-
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               printf("AFTER DEPTH:\n");
-               ir3_print(ir);
-       }
-
-       ret = ir3_sched(ir);
-       if (ret) {
-               DBG("SCHED failed!");
-               goto out;
-       }
-
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               printf("AFTER SCHED:\n");
-               ir3_print(ir);
-       }
-
-       ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
-       if (ret) {
-               DBG("RA failed!");
-               goto out;
-       }
-
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               printf("AFTER RA:\n");
-               ir3_print(ir);
-       }
-
-       /* fixup input/outputs: */
-       for (i = 0; i < so->outputs_count; i++) {
-               so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
-       }
-
-       /* Note that some or all channels of an input may be unused: */
-       actual_in = 0;
-       inloc = 0;
-       for (i = 0; i < so->inputs_count; i++) {
-               unsigned j, reg = regid(63,0), compmask = 0, maxcomp = 0;
-               so->inputs[i].ncomp = 0;
-               so->inputs[i].inloc = inloc;
-               for (j = 0; j < 4; j++) {
-                       struct ir3_instruction *in = inputs[(i*4) + j];
-                       if (in && !(in->flags & IR3_INSTR_UNUSED)) {
-                               compmask |= (1 << j);
-                               reg = in->regs[0]->num - j;
-                               actual_in++;
-                               so->inputs[i].ncomp++;
-                               if ((so->type == MESA_SHADER_FRAGMENT) && so->inputs[i].bary) {
-                                       /* assign inloc: */
-                                       assert(in->regs[1]->flags & IR3_REG_IMMED);
-                                       in->regs[1]->iim_val = inloc + j;
-                                       maxcomp = j + 1;
-                               }
-                       }
-               }
-               if ((so->type == MESA_SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
-                       so->varying_in++;
-                       so->inputs[i].compmask = (1 << maxcomp) - 1;
-                       inloc += maxcomp;
-               } else if (!so->inputs[i].sysval) {
-                       so->inputs[i].compmask = compmask;
-               }
-               so->inputs[i].regid = reg;
-       }
-
-       if (ctx->astc_srgb)
-               fixup_astc_srgb(ctx);
-
-       /* We need to do legalize after (for frag shader's) the "bary.f"
-        * offsets (inloc) have been assigned.
-        */
-       ir3_legalize(ir, &so->num_samp, &so->has_ssbo, &max_bary);
-
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               printf("AFTER LEGALIZE:\n");
-               ir3_print(ir);
-       }
-
-       /* Note that actual_in counts inputs that are not bary.f'd for FS: */
-       if (so->type == MESA_SHADER_VERTEX)
-               so->total_in = actual_in;
-       else
-               so->total_in = max_bary + 1;
-
-out:
-       if (ret) {
-               if (so->ir)
-                       ir3_destroy(so->ir);
-               so->ir = NULL;
-       }
-       compile_free(ctx);
-
-       return ret;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
deleted file mode 100644 (file)
index e8e8cc3..0000000
+++ /dev/null
@@ -1,653 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <math.h>
-
-#include "ir3.h"
-#include "ir3_shader.h"
-
-/*
- * Copy Propagate:
- */
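-/* The basic idea, sketched in ir3 asm (register numbers below are just
- * for illustration, values are still in SSA form at this point):
- *
- *    mov.f32f32 r1, r0
- *    add.f r2, r1, r3
- *
- * becomes:
- *
- *    add.f r2, r0, r3
- *
- * with the mov itself dropped once its last consumer is rewritten.
- */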
-
-struct ir3_cp_ctx {
-       struct ir3 *shader;
-       struct ir3_shader_variant *so;
-       unsigned immediate_idx;
-};
-
-/* is it a type preserving mov, with ok flags? */
-static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
-{
-       if (is_same_type_mov(instr)) {
-               struct ir3_register *dst = instr->regs[0];
-               struct ir3_register *src = instr->regs[1];
-               struct ir3_instruction *src_instr = ssa(src);
-
-               /* only if mov src is SSA (not const/immed): */
-               if (!src_instr)
-                       return false;
-
-               /* no indirect: */
-               if (dst->flags & IR3_REG_RELATIV)
-                       return false;
-               if (src->flags & IR3_REG_RELATIV)
-                       return false;
-
-               if (src->flags & IR3_REG_ARRAY)
-                       return false;
-
-               if (!allow_flags)
-                       if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
-                                       IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
-                               return false;
-
-               /* TODO: remove this hack: */
-               if (src_instr->opc == OPC_META_FO)
-                       return false;
-
-               return true;
-       }
-       return false;
-}
-
-static unsigned cp_flags(unsigned flags)
-{
-       /* only considering these flags (at least for now): */
-       flags &= (IR3_REG_CONST | IR3_REG_IMMED |
-                       IR3_REG_FNEG | IR3_REG_FABS |
-                       IR3_REG_SNEG | IR3_REG_SABS |
-                       IR3_REG_BNOT | IR3_REG_RELATIV);
-       return flags;
-}
-
-static bool valid_flags(struct ir3_instruction *instr, unsigned n,
-               unsigned flags)
-{
-       unsigned valid_flags;
-       flags = cp_flags(flags);
-
-       /* If destination is indirect, then source cannot be.. at least
-        * I don't think so..
-        */
-       if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
-                       (flags & IR3_REG_RELATIV))
-               return false;
-
-       /* TODO it seems to *mostly* work to cp RELATIV, except we get some
-        * intermittent piglit variable-indexing fails.  Newer blob driver
-        * doesn't seem to cp these.  Possibly this is hw workaround?  Not
-        * sure, but until that is understood better, lets just switch off
-        * cp for indirect src's:
-        */
-       if (flags & IR3_REG_RELATIV)
-               return false;
-
-       switch (opc_cat(instr->opc)) {
-       case 1:
-               valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
-               if (flags & ~valid_flags)
-                       return false;
-               break;
-       case 2:
-               valid_flags = ir3_cat2_absneg(instr->opc) |
-                               IR3_REG_CONST | IR3_REG_RELATIV;
-
-               if (ir3_cat2_int(instr->opc))
-                       valid_flags |= IR3_REG_IMMED;
-
-               if (flags & ~valid_flags)
-                       return false;
-
-               if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
-                       unsigned m = (n ^ 1) + 1;
-                       /* cannot deal w/ const in both srcs:
-                        * (note that some cat2 actually only have a single src)
-                        */
-                       if (m < instr->regs_count) {
-                               struct ir3_register *reg = instr->regs[m];
-                               if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
-                                       return false;
-                               if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
-                                       return false;
-                       }
-                       /* cannot be const + ABS|NEG: */
-                       if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
-                                       IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
-                               return false;
-               }
-               break;
-       case 3:
-               valid_flags = ir3_cat3_absneg(instr->opc) |
-                               IR3_REG_CONST | IR3_REG_RELATIV;
-
-               if (flags & ~valid_flags)
-                       return false;
-
-               if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) {
-                       /* cannot deal w/ const/relativ in 2nd src: */
-                       if (n == 1)
-                               return false;
-               }
-
-               if (flags & IR3_REG_CONST) {
-                       /* cannot be const + ABS|NEG: */
-                       if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
-                                       IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
-                               return false;
-               }
-               break;
-       case 4:
-               /* seems like blob compiler avoids const as src.. */
-               /* TODO double check if this is still the case on a4xx */
-               if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
-                       return false;
-               if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
-                       return false;
-               break;
-       case 5:
-               /* no flags allowed */
-               if (flags)
-                       return false;
-               break;
-       case 6:
-               valid_flags = IR3_REG_IMMED;
-               if (flags & ~valid_flags)
-                       return false;
-
-               if (flags & IR3_REG_IMMED) {
-                       /* doesn't seem like we can have immediate src for store
-                        * instructions:
-                        *
-                        * TODO this restriction could also apply to load instructions,
-                        * but for load instructions this arg is the address (and not
-                        * really sure of any good way to test a hard-coded immed addr src)
-                        */
-                       if (is_store(instr) && (n == 1))
-                               return false;
-
-                       if ((instr->opc == OPC_LDL) && (n != 1))
-                               return false;
-
-                       if ((instr->opc == OPC_STL) && (n != 2))
-                               return false;
-
-                       /* disallow CP into anything but the SSBO slot argument for
-                        * atomics:
-                        */
-                       if (is_atomic(instr->opc) && (n != 0))
-                               return false;
-
-                       if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
-                               return false;
-               }
-
-               break;
-       }
-
-       return true;
-}
-
-/* propagate register flags from src to dst.. negates need special
- * handling to cancel each other out.
- */
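-/* eg. collapsing a (neg) mov into a src that already has (neg) must
- * cancel out to no flag at all, which is why FNEG/SNEG/BNOT are xor'd
- * rather than or'd below:
- */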
-static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
-{
-       unsigned srcflags = src->regs[1]->flags;
-
-       /* if what we are combining into already has (abs) flags,
-        * we can drop (neg) from src:
-        */
-       if (*dstflags & IR3_REG_FABS)
-               srcflags &= ~IR3_REG_FNEG;
-       if (*dstflags & IR3_REG_SABS)
-               srcflags &= ~IR3_REG_SNEG;
-
-       if (srcflags & IR3_REG_FABS)
-               *dstflags |= IR3_REG_FABS;
-       if (srcflags & IR3_REG_SABS)
-               *dstflags |= IR3_REG_SABS;
-       if (srcflags & IR3_REG_FNEG)
-               *dstflags ^= IR3_REG_FNEG;
-       if (srcflags & IR3_REG_SNEG)
-               *dstflags ^= IR3_REG_SNEG;
-       if (srcflags & IR3_REG_BNOT)
-               *dstflags ^= IR3_REG_BNOT;
-
-       *dstflags &= ~IR3_REG_SSA;
-       *dstflags |= srcflags & IR3_REG_SSA;
-       *dstflags |= srcflags & IR3_REG_CONST;
-       *dstflags |= srcflags & IR3_REG_IMMED;
-       *dstflags |= srcflags & IR3_REG_RELATIV;
-       *dstflags |= srcflags & IR3_REG_ARRAY;
-
-       /* if src of the src is boolean we can drop the (abs) since we know
-        * the source value is already a positive integer.  This cleans
-        * up the absnegs that get inserted when converting between nir and
-        * native boolean (see ir3_b2n/n2b)
-        */
-       struct ir3_instruction *srcsrc = ssa(src->regs[1]);
-       if (srcsrc && is_bool(srcsrc))
-               *dstflags &= ~IR3_REG_SABS;
-}
-
-static struct ir3_register *
-lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags)
-{
-       unsigned swiz, idx, i;
-
-       reg = ir3_reg_clone(ctx->shader, reg);
-
-       /* in some cases, there are restrictions on (abs)/(neg) plus const..
-        * so just evaluate those and clear the flags:
-        */
-       if (new_flags & IR3_REG_SABS) {
-               reg->iim_val = abs(reg->iim_val);
-               new_flags &= ~IR3_REG_SABS;
-       }
-
-       if (new_flags & IR3_REG_FABS) {
-               reg->fim_val = fabs(reg->fim_val);
-               new_flags &= ~IR3_REG_FABS;
-       }
-
-       if (new_flags & IR3_REG_SNEG) {
-               reg->iim_val = -reg->iim_val;
-               new_flags &= ~IR3_REG_SNEG;
-       }
-
-       if (new_flags & IR3_REG_FNEG) {
-               reg->fim_val = -reg->fim_val;
-               new_flags &= ~IR3_REG_FNEG;
-       }
-
-       /* Reallocate for 4 more elements whenever it's necessary */
-       if (ctx->immediate_idx == ctx->so->immediates_size * 4) {
-               ctx->so->immediates_size += 4;
-               ctx->so->immediates = realloc (ctx->so->immediates,
-                       ctx->so->immediates_size * sizeof (ctx->so->immediates[0]));
-       }
-
-       for (i = 0; i < ctx->immediate_idx; i++) {
-               swiz = i % 4;
-               idx  = i / 4;
-
-               if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) {
-                       break;
-               }
-       }
-
-       if (i == ctx->immediate_idx) {
-               /* need to generate a new immediate: */
-               swiz = i % 4;
-               idx  = i / 4;
-               ctx->so->immediates[idx].val[swiz] = reg->uim_val;
-               ctx->so->immediates_count = idx + 1;
-               ctx->immediate_idx++;
-       }
-
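-       /* the immediate now becomes a const-file reference; eg. if
-        * constbase.immediate is 8, the 5th distinct immediate (i == 4)
-        * maps to reg num 4 + 4*8 == 36, ie. c9.x:
-        */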
-       new_flags &= ~IR3_REG_IMMED;
-       new_flags |= IR3_REG_CONST;
-       reg->flags = new_flags;
-       reg->num = i + (4 * ctx->so->constbase.immediate);
-
-       return reg;
-}
-
-static void
-unuse(struct ir3_instruction *instr)
-{
-       debug_assert(instr->use_count > 0);
-
-       if (--instr->use_count == 0) {
-               struct ir3_block *block = instr->block;
-
-               instr->barrier_class = 0;
-               instr->barrier_conflict = 0;
-
-               /* we don't want to remove anything in keeps (which could
-                * be things like array store's)
-                */
-               for (unsigned i = 0; i < block->keeps_count; i++) {
-                       debug_assert(block->keeps[i] != instr);
-               }
-       }
-}
-
-/**
- * Handle cp for a given src register.  This additionally handles
- * the cases of collapsing immediate/const (which replace the src
- * register with a non-ssa src) or collapsing mov's from relative
- * src (which needs to also fixup the address src reference by the
- * instruction).
- */
-static void
-reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
-               struct ir3_register *reg, unsigned n)
-{
-       struct ir3_instruction *src = ssa(reg);
-
-       if (is_eligible_mov(src, true)) {
-               /* simple case, no immed/const/relativ, only mov's w/ ssa src: */
-               struct ir3_register *src_reg = src->regs[1];
-               unsigned new_flags = reg->flags;
-
-               combine_flags(&new_flags, src);
-
-               if (valid_flags(instr, n, new_flags)) {
-                       if (new_flags & IR3_REG_ARRAY) {
-                               debug_assert(!(reg->flags & IR3_REG_ARRAY));
-                               reg->array = src_reg->array;
-                       }
-                       reg->flags = new_flags;
-                       reg->instr = ssa(src_reg);
-
-                       instr->barrier_class |= src->barrier_class;
-                       instr->barrier_conflict |= src->barrier_conflict;
-
-                       unuse(src);
-                       reg->instr->use_count++;
-               }
-
-       } else if (is_same_type_mov(src) &&
-                       /* cannot collapse const/immed/etc into meta instrs: */
-                       !is_meta(instr)) {
-               /* immed/const/etc cases, which require some special handling: */
-               struct ir3_register *src_reg = src->regs[1];
-               unsigned new_flags = reg->flags;
-
-               combine_flags(&new_flags, src);
-
-               if (!valid_flags(instr, n, new_flags)) {
-                       /* See if lowering an immediate to const would help. */
-                       if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
-                               debug_assert(new_flags & IR3_REG_IMMED);
-                               instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags);
-                               return;
-                       }
-
-                       /* special case for "normal" mad instructions, we can
-                        * try swapping the first two args if that fits better.
-                        *
-                        * the "plain" MAD's (ie. the ones that don't shift first
-                        * src prior to multiply) can swap their first two srcs if
-                        * src[0] is !CONST and src[1] is CONST:
-                        */
-                       if ((n == 1) && is_mad(instr->opc) &&
-                                       !(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) &&
-                                       valid_flags(instr, 0, new_flags & ~IR3_REG_IMMED)) {
-                               /* swap src[0] and src[1]: */
-                               struct ir3_register *tmp;
-                               tmp = instr->regs[0 + 1];
-                               instr->regs[0 + 1] = instr->regs[1 + 1];
-                               instr->regs[1 + 1] = tmp;
-
-                               n = 0;
-                       } else {
-                               return;
-                       }
-               }
-
-               /* Here we handle the special case of mov from
-                * CONST and/or RELATIV.  These need to be handled
-                * specially, because in the case of move from CONST
-                * there is no src ir3_instruction so we need to
-                * replace the ir3_register.  And in the case of
-                * RELATIV we need to handle the address register
-                * dependency.
-                */
-               if (src_reg->flags & IR3_REG_CONST) {
-                       /* an instruction cannot reference two different
-                        * address registers:
-                        */
-                       if ((src_reg->flags & IR3_REG_RELATIV) &&
-                                       conflicts(instr->address, reg->instr->address))
-                               return;
-
-                       /* This seems to be a hw bug, or something where the timings
-                        * just somehow don't work out.  This restriction may only
-                        * apply if the first src is also CONST.
-                        */
-                       if ((opc_cat(instr->opc) == 3) && (n == 2) &&
-                                       (src_reg->flags & IR3_REG_RELATIV) &&
-                                       (src_reg->array.offset == 0))
-                               return;
-
-                       src_reg = ir3_reg_clone(instr->block->shader, src_reg);
-                       src_reg->flags = new_flags;
-                       instr->regs[n+1] = src_reg;
-
-                       if (src_reg->flags & IR3_REG_RELATIV)
-                               ir3_instr_set_address(instr, reg->instr->address);
-
-                       return;
-               }
-
-               if ((src_reg->flags & IR3_REG_RELATIV) &&
-                               !conflicts(instr->address, reg->instr->address)) {
-                       src_reg = ir3_reg_clone(instr->block->shader, src_reg);
-                       src_reg->flags = new_flags;
-                       instr->regs[n+1] = src_reg;
-                       ir3_instr_set_address(instr, reg->instr->address);
-
-                       return;
-               }
-
-               /* NOTE: seems we can only do immed integers, so don't
-                * need to care about float.  But we do need to handle
-                * abs/neg *before* checking that the immediate requires
-                * few enough bits to encode:
-                *
-                * TODO: do we need to do something to avoid accidentally
-                * catching a float immed?
-                */
-               if (src_reg->flags & IR3_REG_IMMED) {
-                       int32_t iim_val = src_reg->iim_val;
-
-                       debug_assert((opc_cat(instr->opc) == 1) ||
-                                       (opc_cat(instr->opc) == 6) ||
-                                       ir3_cat2_int(instr->opc) ||
-                                       (is_mad(instr->opc) && (n == 0)));
-
-                       if (new_flags & IR3_REG_SABS)
-                               iim_val = abs(iim_val);
-
-                       if (new_flags & IR3_REG_SNEG)
-                               iim_val = -iim_val;
-
-                       if (new_flags & IR3_REG_BNOT)
-                               iim_val = ~iim_val;
-
-                       /* other than category 1 (mov) we can only encode up to 10 bits: */
-                       if ((instr->opc == OPC_MOV) ||
-                                       !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) {
-                               new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
-                               src_reg = ir3_reg_clone(instr->block->shader, src_reg);
-                               src_reg->flags = new_flags;
-                               src_reg->iim_val = iim_val;
-                               instr->regs[n+1] = src_reg;
-                       } else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
-                               /* See if lowering an immediate to const would help. */
-                               instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags);
-                       }
-
-                       return;
-               }
-       }
-}
-
-/* Handle special case of eliminating output mov, and similar cases where
- * there isn't a normal "consuming" instruction.  In this case we cannot
- * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
- * be eliminated)
- */
-static struct ir3_instruction *
-eliminate_output_mov(struct ir3_instruction *instr)
-{
-       if (is_eligible_mov(instr, false)) {
-               struct ir3_register *reg = instr->regs[1];
-               if (!(reg->flags & IR3_REG_ARRAY)) {
-                       struct ir3_instruction *src_instr = ssa(reg);
-                       debug_assert(src_instr);
-                       return src_instr;
-               }
-       }
-       return instr;
-}
-
-/**
- * Find instruction src's which are mov's that can be collapsed, replacing
- * the mov dst with the mov src
- */
-static void
-instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
-{
-       struct ir3_register *reg;
-
-       if (instr->regs_count == 0)
-               return;
-
-       if (ir3_instr_check_mark(instr))
-               return;
-
-       /* walk down the graph from each src: */
-       foreach_src_n(reg, n, instr) {
-               struct ir3_instruction *src = ssa(reg);
-
-               if (!src)
-                       continue;
-
-               instr_cp(ctx, src);
-
-               /* TODO for non-indirect access we could figure out which register
-                * we actually want and allow cp..
-                */
-               if (reg->flags & IR3_REG_ARRAY)
-                       continue;
-
-               /* Don't CP absneg into meta instructions, that won't end well: */
-               if (is_meta(instr) && (src->opc != OPC_MOV))
-                       continue;
-
-               reg_cp(ctx, instr, reg, n);
-       }
-
-       if (instr->regs[0]->flags & IR3_REG_ARRAY) {
-               struct ir3_instruction *src = ssa(instr->regs[0]);
-               if (src)
-                       instr_cp(ctx, src);
-       }
-
-       if (instr->address) {
-               instr_cp(ctx, instr->address);
-               ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
-       }
-
-       /* we can end up with extra cmps.s from frontend, which uses a
-        *
-        *    cmps.s p0.x, cond, 0
-        *
-        * as a way to mov into the predicate register.  But frequently 'cond'
-        * is itself a cmps.s/cmps.f/cmps.u.  So detect this special case and
- * just re-write the instruction writing the predicate register to get rid
-        * of the double cmps.
-        */
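-       /* ie. something like:
-        *
-        *    cmps.f.lt tmp, a, b
-        *    cmps.s    p0.x, tmp, 0
-        *
-        * becomes:
-        *
-        *    cmps.f.lt p0.x, a, b
-        */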
-       if ((instr->opc == OPC_CMPS_S) &&
-                       (instr->regs[0]->num == regid(REG_P0, 0)) &&
-                       ssa(instr->regs[1]) &&
-                       (instr->regs[2]->flags & IR3_REG_IMMED) &&
-                       (instr->regs[2]->iim_val == 0)) {
-               struct ir3_instruction *cond = ssa(instr->regs[1]);
-               switch (cond->opc) {
-               case OPC_CMPS_S:
-               case OPC_CMPS_F:
-               case OPC_CMPS_U:
-                       instr->opc   = cond->opc;
-                       instr->flags = cond->flags;
-                       instr->cat2  = cond->cat2;
-                       instr->address = cond->address;
-                       instr->regs[1] = cond->regs[1];
-                       instr->regs[2] = cond->regs[2];
-                       instr->barrier_class |= cond->barrier_class;
-                       instr->barrier_conflict |= cond->barrier_conflict;
-                       unuse(cond);
-                       break;
-               default:
-                       break;
-               }
-       }
-}
-
-void
-ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
-{
-       struct ir3_cp_ctx ctx = {
-                       .shader = ir,
-                       .so = so,
-       };
-
-       /* This is a bit annoying, and probably wouldn't be necessary if we
-        * tracked a reverse link from producing instruction to consumer.
-        * But we need to know when we've eliminated the last consumer
-        * of a mov, so first do a pass to count each mov's consumers.
-        */
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-                       struct ir3_instruction *src;
-
-                       /* by the way, we don't account for false-dep's, so the CP
-                        * pass should always happen before false-dep's are inserted
-                        */
-                       debug_assert(instr->deps_count == 0);
-
-                       foreach_ssa_src(src, instr) {
-                               src->use_count++;
-                       }
-               }
-       }
-
-       ir3_clear_mark(ir);
-
-       for (unsigned i = 0; i < ir->noutputs; i++) {
-               if (ir->outputs[i]) {
-                       instr_cp(&ctx, ir->outputs[i]);
-                       ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
-               }
-       }
-
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               if (block->condition) {
-                       instr_cp(&ctx, block->condition);
-                       block->condition = eliminate_output_mov(block->condition);
-               }
-
-               for (unsigned i = 0; i < block->keeps_count; i++) {
-                       instr_cp(&ctx, block->keeps[i]);
-                       block->keeps[i] = eliminate_output_mov(block->keeps[i]);
-               }
-       }
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
deleted file mode 100644 (file)
index 73bf5e1..0000000
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "util/u_math.h"
-
-#include "ir3.h"
-
-/*
- * Instruction Depth:
- *
- * Calculates weighted instruction depth, ie. the sum of # of needed
- * instructions plus delay slots back to original input (ie INPUT or
- * CONST).  That is to say, an instruction's depth is:
- *
- *   depth(instr) {
- *     d = 0;
- *     // for each src register:
- *     foreach (src in instr->regs[1..n])
- *       d = max(d, delayslots(src->instr, n) + depth(src->instr));
- *     return d + 1;
- *   }
- *
- * After an instruction's depth is calculated, it is inserted into the
- * block's depth-sorted list, which is used by the scheduling pass.
- */
-
-/* generally don't count false dependencies, since this can just be
- * something like a barrier, or SSBO store.  The exception is array
- * dependencies if the assigner is an array write and the consumer
- * reads the same array.
- */
-static bool
-ignore_dep(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n)
-{
-       if (!__is_false_dep(consumer, n))
-               return false;
-
-       if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
-               struct ir3_register *dst = assigner->regs[0];
-               struct ir3_register *src;
-
-               debug_assert(dst->flags & IR3_REG_ARRAY);
-
-               foreach_src(src, consumer) {
-                       if ((src->flags & IR3_REG_ARRAY) &&
-                                       (dst->array.id == src->array.id)) {
-                               return false;
-                       }
-               }
-       }
-
-       return true;
-}
-
-/* calculate required # of delay slots between the instruction that
- * assigns a value and the one that consumes it
- */
-int ir3_delayslots(struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer, unsigned n)
-{
-       if (ignore_dep(assigner, consumer, n))
-               return 0;
-
-       /* worst case is cat1-3 (alu) -> cat4/5, needing 6 cycles; normal
-        * alu -> alu needs 3 cycles; cat4 -> alu and texture fetches are
-        * handled with sync bits
-        */
-
-       if (is_meta(assigner))
-               return 0;
-
-       if (writes_addr(assigner))
-               return 6;
-
-       /* handled via sync flags: */
-       if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
-               return 0;
-
-       /* assigner must be alu: */
-       if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
-                       is_mem(consumer)) {
-               return 6;
-       } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
-                       (n == 3)) {
-               /* special case, 3rd src to cat3 not required on first cycle */
-               return 1;
-       } else {
-               return 3;
-       }
-}
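
Read as a pure function, the rule table above reduces to something like the sketch below; the bool parameters stand in for the is_meta()/writes_addr()/is_sfu()/etc. queries the real function performs on assigner and consumer:

    #include <stdbool.h>
    #include <stdio.h>

    static int delay_slots(bool meta, bool writes_addr, bool producer_synced,
                           bool consumer_synced, bool mad_third_src)
    {
        if (meta)            return 0;  /* meta instrs don't exist at encode time */
        if (writes_addr)     return 6;  /* address register writes: worst case */
        if (producer_synced) return 0;  /* sfu/tex/mem covered by (ss)/(sy) */
        if (consumer_synced) return 6;  /* alu -> flow/sfu/tex/mem */
        if (mad_third_src)   return 1;  /* 3rd cat3 src not needed on first cycle */
        return 3;                       /* plain alu -> alu */
    }

    int main(void)
    {
        /* alu feeding the 3rd src of a mad needs just one slot: */
        printf("%d\n", delay_slots(false, false, false, false, true));
        return 0;
    }
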
-
-void
-ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
-{
-       /* remove from existing spot in list: */
-       list_delinit(&instr->node);
-
-       /* find where to re-insert instruction: */
-       list_for_each_entry (struct ir3_instruction, pos, list, node) {
-               if (pos->depth > instr->depth) {
-                       list_add(&instr->node, &pos->node);
-                       return;
-               }
-       }
-       /* if we get here, we didn't find an insertion spot: */
-       list_addtail(&instr->node, list);
-}
-
-static void
-ir3_instr_depth(struct ir3_instruction *instr, unsigned boost, bool falsedep)
-{
-       struct ir3_instruction *src;
-
-       /* don't mark falsedep's as used, but otherwise process them normally: */
-       if (!falsedep)
-               instr->flags &= ~IR3_INSTR_UNUSED;
-
-       if (ir3_instr_check_mark(instr))
-               return;
-
-       instr->depth = 0;
-
-       foreach_ssa_src_n(src, i, instr) {
-               unsigned sd;
-
-               /* visit child to compute its depth: */
-               ir3_instr_depth(src, boost, __is_false_dep(instr, i));
-
-               /* for array writes, no need to delay on previous write: */
-               if (i == 0)
-                       continue;
-
-               sd = ir3_delayslots(src, instr, i) + src->depth;
-               sd += boost;
-
-               instr->depth = MAX2(instr->depth, sd);
-       }
-
-       if (!is_meta(instr))
-               instr->depth++;
-
-       ir3_insert_by_depth(instr, &instr->block->instr_list);
-}
-
-static bool
-remove_unused_by_block(struct ir3_block *block)
-{
-       bool progress = false;
-       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
-               if (instr->opc == OPC_END)
-                       continue;
-               if (instr->flags & IR3_INSTR_UNUSED) {
-                       list_delinit(&instr->node);
-                       progress = true;
-               }
-       }
-       return progress;
-}
-
-static bool
-compute_depth_and_remove_unused(struct ir3 *ir)
-{
-       unsigned i;
-       bool progress = false;
-
-       ir3_clear_mark(ir);
-
-       /* initially mark everything as unused; we'll clear the flag as we
-        * visit the instructions:
-        */
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-                       instr->flags |= IR3_INSTR_UNUSED;
-               }
-       }
-
-       for (i = 0; i < ir->noutputs; i++)
-               if (ir->outputs[i])
-                       ir3_instr_depth(ir->outputs[i], 0, false);
-
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               for (i = 0; i < block->keeps_count; i++)
-                       ir3_instr_depth(block->keeps[i], 0, false);
-
-               /* We also need to account for if-condition: */
-               if (block->condition)
-                       ir3_instr_depth(block->condition, 6, false);
-       }
-
-       /* mark un-used instructions: */
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               progress |= remove_unused_by_block(block);
-       }
-
-       /* note that we can end up with unused indirects, but we should
-        * not end up with unused predicates.
-        */
-       for (i = 0; i < ir->indirects_count; i++) {
-               struct ir3_instruction *instr = ir->indirects[i];
-               if (instr && (instr->flags & IR3_INSTR_UNUSED))
-                       ir->indirects[i] = NULL;
-       }
-
-       /* cleanup unused inputs: */
-       for (i = 0; i < ir->ninputs; i++) {
-               struct ir3_instruction *in = ir->inputs[i];
-               if (in && (in->flags & IR3_INSTR_UNUSED))
-                       ir->inputs[i] = NULL;
-       }
-
-       return progress;
-}
-
-void
-ir3_depth(struct ir3 *ir)
-{
-       bool progress;
-       do {
-               progress = compute_depth_and_remove_unused(ir);
-       } while (progress);
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 3a1b857e0107fb14bf64c0cc58570d2fc37374e4..cc6efa1ca17853e83826ee217291f2e52bcef6ef 100644 (file)
 #include "freedreno_context.h"
 #include "freedreno_util.h"
 
-#include "ir3_shader.h"
-#include "ir3_gallium.h"
-#include "ir3_compiler.h"
-#include "ir3_nir.h"
+#include "ir3/ir3_shader.h"
+#include "ir3/ir3_gallium.h"
+#include "ir3/ir3_compiler.h"
+#include "ir3/ir3_nir.h"
 
 static void
 dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.h b/src/gallium/drivers/freedreno/ir3/ir3_gallium.h
index cf1d48d97baadbb79324c14b6dc8f33fa44252a0..5fb745967817d71f2a99d93c1c211486d9ca387a 100644 (file)
@@ -28,7 +28,7 @@
 #define IR3_GALLIUM_H_
 
 #include "pipe/p_state.h"
-#include "ir3_shader.h"
+#include "ir3/ir3_shader.h"
 
 struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler,
                const struct pipe_shader_state *cso, gl_shader_stage type,
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
deleted file mode 100644 (file)
index 5700559..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "ir3.h"
-
-/*
- * Find/group instruction neighbors:
- */
-
-/* bleh.. we need to do the same group_n() thing for both inputs/outputs
- * (where we have a simple instr[] array), and fanin nodes (where we have
- * an extra indirection via reg->instr).
- */
-struct group_ops {
-       struct ir3_instruction *(*get)(void *arr, int idx);
-       void (*insert_mov)(void *arr, int idx, struct ir3_instruction *instr);
-};
-
-static struct ir3_instruction *arr_get(void *arr, int idx)
-{
-       return ((struct ir3_instruction **)arr)[idx];
-}
-static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
-{
-       ((struct ir3_instruction **)arr)[idx] =
-                       ir3_MOV(instr->block, instr, TYPE_F32);
-}
-static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
-{
-       /* so, we can't insert a mov in front of a meta:in.. and the downstream
-        * instruction already has a pointer to 'instr'.  So we cheat a bit and
-        * morph the meta:in instruction into a mov and insert a new meta:in
-        * in front.
-        */
-       struct ir3_instruction *in;
-
-       debug_assert(instr->regs_count == 1);
-
-       in = ir3_instr_create(instr->block, OPC_META_INPUT);
-       in->inout.block = instr->block;
-       ir3_reg_create(in, instr->regs[0]->num, 0);
-
-       /* create src reg for meta:in and fixup to now be a mov: */
-       ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = in;
-       instr->opc = OPC_MOV;
-       instr->cat1.src_type = TYPE_F32;
-       instr->cat1.dst_type = TYPE_F32;
-
-       ((struct ir3_instruction **)arr)[idx] = in;
-}
-static struct group_ops arr_ops_out = { arr_get, arr_insert_mov_out };
-static struct group_ops arr_ops_in = { arr_get, arr_insert_mov_in };
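
The same one-algorithm-two-layouts trick in miniature, with a toy ops table unrelated to the real ir3 types: the loop body is identical, and the storage layout hides behind get():

    #include <stdio.h>

    struct ops { int (*get)(void *arr, int idx); };

    static int get_direct(void *arr, int idx) { return ((int *)arr)[idx]; }
    static int get_boxed(void *arr, int idx)  { return *((int **)arr)[idx]; }

    static int sum(const struct ops *o, void *arr, int n)
    {
        int s = 0;
        for (int i = 0; i < n; i++)
            s += o->get(arr, i);   /* same loop, layout hidden behind get() */
        return s;
    }

    int main(void)
    {
        int a[] = {1, 2, 3};
        int *b[] = {&a[0], &a[1], &a[2]};
        struct ops direct = { get_direct }, boxed = { get_boxed };
        printf("%d %d\n", sum(&direct, a, 3), sum(&boxed, b, 3));
        return 0;
    }
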
-
-static struct ir3_instruction *instr_get(void *arr, int idx)
-{
-       return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
-}
-static void
-instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
-{
-       ((struct ir3_instruction *)arr)->regs[idx+1]->instr =
-                       ir3_MOV(instr->block, instr, TYPE_F32);
-}
-static struct group_ops instr_ops = { instr_get, instr_insert_mov };
-
-/* check whether cur is instr itself, or already in instr's neighbor-list: */
-static bool
-in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos)
-{
-       int idx = 0;
-
-       if (!instr)
-               return false;
-
-       if (instr == cur)
-               return true;
-
-       for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right)
-               if ((idx++ != pos) && (instr == cur))
-                       return true;
-
-       return false;
-}
-
-static void
-group_n(struct group_ops *ops, void *arr, unsigned n)
-{
-       unsigned i, j;
-
-       /* first pass, figure out what has conflicts and needs a mov
-        * inserted.  Do this up front, before starting to setup
-        * left/right neighbor pointers.  Trying to do it in a single
-        * pass could result in a situation where we can't even setup
-        * the mov's right neighbor ptr if the next instr also needs
-        * a mov.
-        */
-restart:
-       for (i = 0; i < n; i++) {
-               struct ir3_instruction *instr = ops->get(arr, i);
-               if (instr) {
-                       struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
-                       struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
-                       bool conflict;
-
-                       /* check for left/right neighbor conflicts: */
-                       conflict = conflicts(instr->cp.left, left) ||
-                               conflicts(instr->cp.right, right);
-
-                       /* Mixing array elements and higher register classes
-                        * (ie. groups) doesn't really work out in RA.  See:
-                        *
-                        * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag
-                        */
-                       if (instr->regs[0]->flags & IR3_REG_ARRAY)
-                               conflict = true;
-
-                       /* we also can't have an instr twice in the group: */
-                       for (j = i + 1; (j < n) && !conflict; j++)
-                               if (in_neighbor_list(ops->get(arr, j), instr, i))
-                                       conflict = true;
-
-                       if (conflict) {
-                               ops->insert_mov(arr, i, instr);
-                               /* inserting the mov may have caused a conflict
-                                * against the previous:
-                                */
-                               goto restart;
-                       }
-               }
-       }
-
-       /* second pass, now that we've inserted mov's, fixup left/right
-        * neighbors.  This is guaranteed to succeed, since by definition
-        * the newly inserted mov's cannot conflict with anything.
-        */
-       for (i = 0; i < n; i++) {
-               struct ir3_instruction *instr = ops->get(arr, i);
-               if (instr) {
-                       struct ir3_instruction *left = (i > 0) ? ops->get(arr, i - 1) : NULL;
-                       struct ir3_instruction *right = (i < (n-1)) ? ops->get(arr, i + 1) : NULL;
-
-                       debug_assert(!conflicts(instr->cp.left, left));
-                       if (left) {
-                               instr->cp.left_cnt++;
-                               instr->cp.left = left;
-                       }
-
-                       debug_assert(!conflicts(instr->cp.right, right));
-                       if (right) {
-                               instr->cp.right_cnt++;
-                               instr->cp.right = right;
-                       }
-               }
-       }
-}
-
-static void
-instr_find_neighbors(struct ir3_instruction *instr)
-{
-       struct ir3_instruction *src;
-
-       if (ir3_instr_check_mark(instr))
-               return;
-
-       if (instr->opc == OPC_META_FI)
-               group_n(&instr_ops, instr, instr->regs_count - 1);
-
-       foreach_ssa_src(src, instr)
-               instr_find_neighbors(src);
-}
-
-/* a bit of sadness.. we can't have "holes" in inputs from the PoV of
- * register assignment; they still need to be grouped together.  So
- * we need to insert a dummy/padding instruction for grouping, and
- * then take it back out again before anyone notices.
- */
-static void
-pad_and_group_input(struct ir3_instruction **input, unsigned n)
-{
-       int i, mask = 0;
-       struct ir3_block *block = NULL;
-
-       for (i = n - 1; i >= 0; i--) {
-               struct ir3_instruction *instr = input[i];
-               if (instr) {
-                       block = instr->block;
-               } else if (block) {
-                       instr = ir3_NOP(block);
-                       ir3_reg_create(instr, 0, IR3_REG_SSA);    /* dummy dst */
-                       input[i] = instr;
-                       mask |= (1 << i);
-               }
-       }
-
-       group_n(&arr_ops_in, input, n);
-
-       for (i = 0; i < n; i++) {
-               if (mask & (1 << i))
-                       input[i] = NULL;
-       }
-}
-
-static void
-find_neighbors(struct ir3 *ir)
-{
-       unsigned i;
-
-       /* shader inputs/outputs themselves must be contiguous as well:
-        *
-        * NOTE: group inputs first, since we only insert mov's
-        * *before* the conflicted instr (and that would go badly
-        * for inputs).  By doing inputs first, we should never
- * have a conflict on inputs.. pushing any conflicts to be
- * resolved at the outputs, for stuff like:
-        *
-        *     MOV OUT[n], IN[m].wzyx
-        *
-        * NOTE: we assume here inputs/outputs are grouped in vec4.
- * This logic won't quite cut it if smaller vectors are not
- * aligned on vec4 boundaries.
-        */
-       for (i = 0; i < ir->ninputs; i += 4)
-               pad_and_group_input(&ir->inputs[i], 4);
-       for (i = 0; i < ir->noutputs; i += 4)
-               group_n(&arr_ops_out, &ir->outputs[i], 4);
-
-       for (i = 0; i < ir->noutputs; i++) {
-               if (ir->outputs[i]) {
-                       struct ir3_instruction *instr = ir->outputs[i];
-                       instr_find_neighbors(instr);
-               }
-       }
-
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               for (i = 0; i < block->keeps_count; i++) {
-                       struct ir3_instruction *instr = block->keeps[i];
-                       instr_find_neighbors(instr);
-               }
-
-               /* We also need to account for if-condition: */
-               if (block->condition)
-                       instr_find_neighbors(block->condition);
-       }
-}
-
-void
-ir3_group(struct ir3 *ir)
-{
-       ir3_clear_mark(ir);
-       find_neighbors(ir);
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
deleted file mode 100644 (file)
index ff4c644..0000000
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "util/ralloc.h"
-#include "util/u_math.h"
-
-#include "ir3.h"
-
-/*
- * Legalize:
- *
- * We currently require that scheduling ensures that we have enough nop's
- * in all the right places.  The legalize step mostly handles fixing up
- * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's
- * into fewer nop's w/ rpt flag.
- */
-
-struct ir3_legalize_ctx {
-       int num_samp;
-       bool has_ssbo;
-       int max_bary;
-};
-
-struct ir3_legalize_state {
-       regmask_t needs_ss;
-       regmask_t needs_ss_war;       /* write after read */
-       regmask_t needs_sy;
-};
-
-struct ir3_legalize_block_data {
-       bool valid;
-       struct ir3_legalize_state state;
-};
-
-/* We want to evaluate each block from the position of every
- * predecessor block, so that the flags set are the union of
- * all possible program paths.
- *
- * To do this, we need to know the output state (needs_ss/ss_war/sy)
- * of all predecessor blocks.  The tricky thing is loops, which mean
- * that we can't simply recursively process each predecessor block
- * before legalizing the current block.
- *
- * How we handle that is by looping over all the blocks until the
- * results converge.  If the output state of a given block changes
- * in a given pass, this means that all successor blocks are not
- * yet fully legalized.
- */
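
A toy model of that convergence loop, with each block's (ss)/(sy) state reduced to a bitmask OR'd from its predecessors. The back-edge from block 3 to block 1 forces a second pass, just as loops do here:

    #include <stdbool.h>
    #include <stdio.h>

    #define N 4

    int main(void)
    {
        unsigned out[N] = {1, 2, 4, 8};
        /* pred[i] = bitmask of predecessor blocks (block 3 loops back to 1) */
        unsigned pred[N] = {0, 1u << 0 | 1u << 3, 1u << 1, 1u << 2};

        bool progress;
        do {
            progress = false;
            for (int i = 0; i < N; i++) {
                unsigned in = 0;
                for (int p = 0; p < N; p++)
                    if (pred[i] & (1u << p))
                        in |= out[p];
                unsigned nout = out[i] | in;
                if (nout != out[i]) {
                    out[i] = nout;
                    progress = true;  /* successors see new state next pass */
                }
            }
        } while (progress);  /* stop once no block's output state changes */

        for (int i = 0; i < N; i++)
            printf("block %d: 0x%x\n", i, out[i]);
        return 0;
    }
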
-
-static bool
-legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
-{
-       struct ir3_legalize_block_data *bd = block->data;
-
-       if (bd->valid)
-               return false;
-
-       struct ir3_instruction *last_input = NULL;
-       struct ir3_instruction *last_rel = NULL;
-       struct ir3_instruction *last_n = NULL;
-       struct list_head instr_list;
-       struct ir3_legalize_state prev_state = bd->state;
-       struct ir3_legalize_state *state = &bd->state;
-
-       /* our input state is the OR of all predecessor blocks' state: */
-       for (unsigned i = 0; i < block->predecessors_count; i++) {
-               struct ir3_legalize_block_data *pbd = block->predecessors[i]->data;
-               struct ir3_legalize_state *pstate = &pbd->state;
-
-               /* Our input (ss)/(sy) state is based on OR'ing the output
-                * state of all our predecessor blocks
-                */
-               regmask_or(&state->needs_ss,
-                               &state->needs_ss, &pstate->needs_ss);
-               regmask_or(&state->needs_ss_war,
-                               &state->needs_ss_war, &pstate->needs_ss_war);
-               regmask_or(&state->needs_sy,
-                               &state->needs_sy, &pstate->needs_sy);
-       }
-
-       /* remove all the instructions from the list, we'll be adding
-        * them back in as we go
-        */
-       list_replace(&block->instr_list, &instr_list);
-       list_inithead(&block->instr_list);
-
-       list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) {
-               struct ir3_register *reg;
-               unsigned i;
-
-               n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
-
-               if (is_meta(n))
-                       continue;
-
-               if (is_input(n)) {
-                       struct ir3_register *inloc = n->regs[1];
-                       assert(inloc->flags & IR3_REG_IMMED);
-                       ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
-               }
-
-               if (last_n && is_barrier(last_n))
-                       n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
-
-               /* NOTE: consider dst register too.. it could happen that
-                * a texture sample instruction (for example) writes some
-                * components which are unused.  A subsequent instruction
-                * that writes the same register can race w/ the sam instr
-                * resulting in undefined results:
-                */
-               for (i = 0; i < n->regs_count; i++) {
-                       reg = n->regs[i];
-
-                       if (reg_gpr(reg)) {
-
-                               /* TODO: we probably only need (ss) for alu
-                                * instr consuming sfu result.. need to make
-                                * some tests for both this and (sy)..
-                                */
-                               if (regmask_get(&state->needs_ss, reg)) {
-                                       n->flags |= IR3_INSTR_SS;
-                                       regmask_init(&state->needs_ss_war);
-                                       regmask_init(&state->needs_ss);
-                               }
-
-                               if (regmask_get(&state->needs_sy, reg)) {
-                                       n->flags |= IR3_INSTR_SY;
-                                       regmask_init(&state->needs_sy);
-                               }
-                       }
-
-                       /* TODO: is it valid to have address reg loaded from a
-                        * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
-                        * last_rel check below should be moved ahead of this:
-                        */
-                       if (reg->flags & IR3_REG_RELATIV)
-                               last_rel = n;
-               }
-
-               if (n->regs_count > 0) {
-                       reg = n->regs[0];
-                       if (regmask_get(&state->needs_ss_war, reg)) {
-                               n->flags |= IR3_INSTR_SS;
-                               regmask_init(&state->needs_ss_war);
-                               regmask_init(&state->needs_ss);
-                       }
-
-                       if (last_rel && (reg->num == regid(REG_A0, 0))) {
-                               last_rel->flags |= IR3_INSTR_UL;
-                               last_rel = NULL;
-                       }
-               }
-
-               /* cat5+ does not have an (ss) bit; if needed we insert a
-                * nop to carry the sync flag.  Would be kinda clever if
-                * we were aware of this during scheduling, but this
-                * should be a pretty rare case:
-                */
-               if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
-                       struct ir3_instruction *nop;
-                       nop = ir3_NOP(block);
-                       nop->flags |= IR3_INSTR_SS;
-                       n->flags &= ~IR3_INSTR_SS;
-               }
-
-               /* need to be able to set (ss) on first instruction: */
-               if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
-                       ir3_NOP(block);
-
-               if (is_nop(n) && !list_empty(&block->instr_list)) {
-                       struct ir3_instruction *last = list_last_entry(&block->instr_list,
-                                       struct ir3_instruction, node);
-                       if (is_nop(last) && (last->repeat < 5)) {
-                               last->repeat++;
-                               last->flags |= n->flags;
-                               continue;
-                       }
-               }
-
-               list_addtail(&n->node, &block->instr_list);
-
-               if (is_sfu(n))
-                       regmask_set(&state->needs_ss, n->regs[0]);
-
-               if (is_tex(n)) {
-                       /* this ends up being the # of samp instructions.. but that
-                        * is ok, everything else only cares whether it is zero or
-                        * not.  We do this here, rather than when we encounter a
-                        * SAMP decl, because (especially in the binning pass shader)
-                        * the samp instruction(s) could get eliminated if the
-                        * result is not used.
-                        */
-                       ctx->num_samp = MAX2(ctx->num_samp, n->cat5.samp + 1);
-                       regmask_set(&state->needs_sy, n->regs[0]);
-               } else if (n->opc == OPC_RESINFO) {
-                       regmask_set(&state->needs_ss, n->regs[0]);
-                       ir3_NOP(block)->flags |= IR3_INSTR_SS;
-               } else if (is_load(n)) {
-                       /* seems like ldlv needs (ss) bit instead??  which is odd but
-                        * makes a bunch of flat-varying tests start working on a4xx.
-                        */
-                       if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
-                               regmask_set(&state->needs_ss, n->regs[0]);
-                       else
-                               regmask_set(&state->needs_sy, n->regs[0]);
-               } else if (is_atomic(n->opc)) {
-                       if (n->flags & IR3_INSTR_G)
-                               regmask_set(&state->needs_sy, n->regs[0]);
-                       else
-                               regmask_set(&state->needs_ss, n->regs[0]);
-               }
-
-               if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
-                       ctx->has_ssbo = true;
-
-               /* both tex/sfu appear to not always immediately consume
-                * their src register(s):
-                */
-               if (is_tex(n) || is_sfu(n) || is_mem(n)) {
-                       foreach_src(reg, n) {
-                               if (reg_gpr(reg))
-                                       regmask_set(&state->needs_ss_war, reg);
-                       }
-               }
-
-               if (is_input(n))
-                       last_input = n;
-
-               last_n = n;
-       }
-
-       if (last_input) {
-               /* special hack.. if using ldlv to bypass interpolation,
-                * we need to insert a dummy bary.f on which we can set
-                * the (ei) flag:
-                */
-               if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
-                       struct ir3_instruction *baryf;
-
-                       /* (ss)bary.f (ei)r63.x, 0, r0.x */
-                       baryf = ir3_instr_create(block, OPC_BARY_F);
-                       baryf->flags |= IR3_INSTR_SS;
-                       ir3_reg_create(baryf, regid(63, 0), 0);
-                       ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
-                       ir3_reg_create(baryf, regid(0, 0), 0);
-
-                       /* insert the dummy bary.f after last_input: */
-                       list_delinit(&baryf->node);
-                       list_add(&baryf->node, &last_input->node);
-
-                       last_input = baryf;
-               }
-               last_input->regs[0]->flags |= IR3_REG_EI;
-       }
-
-       if (last_rel)
-               last_rel->flags |= IR3_INSTR_UL;
-
-       bd->valid = true;
-
-       if (memcmp(&prev_state, state, sizeof(*state))) {
-               /* our output state changed, this invalidates all of our
-                * successors:
-                */
-               for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
-                       if (!block->successors[i])
-                               break;
-                       struct ir3_legalize_block_data *pbd = block->successors[i]->data;
-                       pbd->valid = false;
-               }
-       }
-
-       return true;
-}
-
-/* NOTE: branch instructions are always the last instruction(s)
- * in the block.  We take advantage of this as we resolve the
- * branches, since "if (foo) break;" constructs turn into
- * something like:
- *
- *   block3 {
- *     ...
- *     0029:021: mov.s32s32 r62.x, r1.y
- *     0082:022: br !p0.x, target=block5
- *     0083:023: br p0.x, target=block4
- *     // succs: if _[0029:021: mov.s32s32] block4; else block5;
- *   }
- *   block4 {
- *     0084:024: jump, target=block6
- *     // succs: block6;
- *   }
- *   block5 {
- *     0085:025: jump, target=block7
- *     // succs: block7;
- *   }
- *
- * ie. the only instruction in block4/block5 is a jump, so when
- * resolving branches we can easily detect this by checking
- * that the first instruction in the target block is itself
- * a jump, and set up the br directly to the jump's target
- * (and strip back out the now-unreached jump).
- *
- * TODO sometimes we end up with things like:
- *
- *    br !p0.x, #2
- *    br p0.x, #12
- *    add.u r0.y, r0.y, 1
- *
- * If we swapped the order of the branches, we could drop one.
- */
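
A sketch of that jump-threading step with toy types. The chain is collapsed in a single loop here, whereas the real pass retargets one step per resolve_jump() call and relies on the outer iteration to reach the same fixed point:

    #include <stdio.h>

    struct blk {
        struct blk *succ;   /* unconditional successor, NULL for the last block */
        int jump_only;      /* block is empty or contains just a jump */
    };

    static struct blk *resolve(struct blk *b)
    {
        while (b->succ && b->jump_only)
            b = b->succ;    /* branch straight to where the jump goes */
        return b;
    }

    int main(void)
    {
        struct blk b6 = { NULL, 0 };
        struct blk b4 = { &b6, 1 };   /* like block4 above: only a jump */
        printf("retarget to %s\n", resolve(&b4) == &b6 ? "block6" : "block4");
        return 0;
    }
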
-static struct ir3_block *
-resolve_dest_block(struct ir3_block *block)
-{
-       /* special case for last block: */
-       if (!block->successors[0])
-               return block;
-
-       /* NOTE that we may or may not have inserted the jump
-        * in the target block yet, so conditions to resolve
-        * the dest to the dest block's successor are:
-        *
-        *   (1) successor[1] == NULL &&
-        *   (2) (block-is-empty || only-instr-is-jump)
-        */
-       if (block->successors[1] == NULL) {
-               if (list_empty(&block->instr_list)) {
-                       return block->successors[0];
-               } else if (list_length(&block->instr_list) == 1) {
-                       struct ir3_instruction *instr = list_first_entry(
-                                       &block->instr_list, struct ir3_instruction, node);
-                       if (instr->opc == OPC_JUMP)
-                               return block->successors[0];
-               }
-       }
-       return block;
-}
-
-static bool
-resolve_jump(struct ir3_instruction *instr)
-{
-       struct ir3_block *tblock =
-               resolve_dest_block(instr->cat0.target);
-       struct ir3_instruction *target;
-
-       if (tblock != instr->cat0.target) {
-               list_delinit(&instr->cat0.target->node);
-               instr->cat0.target = tblock;
-               return true;
-       }
-
-       target = list_first_entry(&tblock->instr_list,
-                               struct ir3_instruction, node);
-
-       /* TODO maybe a less fragile way to do this.  But we are expecting
-        * a pattern from sched_block() that looks like:
-        *
-        *   br !p0.x, #else-block
-        *   br p0.x, #if-block
-        *
- * if the first branch target is +2, or if the 2nd branch target is +1,
- * then we can just drop the jump.
-        */
-       unsigned next_block;
-       if (instr->cat0.inv == true)
-               next_block = 2;
-       else
-               next_block = 1;
-
-       if ((!target) || (target->ip == (instr->ip + next_block))) {
-               list_delinit(&instr->node);
-               return true;
-       } else {
-               instr->cat0.immed =
-                       (int)target->ip - (int)instr->ip;
-       }
-       return false;
-}
-
-/* resolve jumps, removing jumps/branches to the immediately following
- * instruction, which we end up with from earlier stages.  Since
- * removing an instruction can invalidate earlier instructions'
- * branch offsets, we need to do this iteratively until no more
- * branches are removed.
- */
-static bool
-resolve_jumps(struct ir3 *ir)
-{
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
-                       if (is_flow(instr) && instr->cat0.target)
-                               if (resolve_jump(instr))
-                                       return true;
-
-       return false;
-}
-
-/* we want to mark points where divergent flow control re-converges
- * with (jp) flags.  For now, since we don't do any optimization for
- * things that start out as a 'do {} while()', re-convergence points
- * will always be a branch or jump target.  Note that this is overly
- * conservative, since unconditional jump targets are not convergence
- * points, we are just assuming that the other path to reach the jump
- * target was divergent.  If we were clever enough to optimize the
- * jump at end of a loop back to a conditional branch into a single
- * conditional branch, ie. like:
- *
- *    add.f r1.w, r0.x, (neg)(r)c2.x   <= loop start
- *    mul.f r1.z, r1.z, r0.x
- *    mul.f r1.y, r1.y, r0.x
- *    mul.f r0.z, r1.x, r0.x
- *    mul.f r0.w, r0.y, r0.x
- *    cmps.f.ge r0.x, (r)c2.y, (r)r1.w
- *    add.s r0.x, (r)r0.x, (r)-1
- *    sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
- *    cmps.f.eq p0.x, r0.x, c3.y
- *    mov.f32f32 r0.x, r1.w
- *    mov.f32f32 r0.y, r0.w
- *    mov.f32f32 r1.x, r0.z
- *    (rpt2)nop
- *    br !p0.x, #-13
- *    (jp)mul.f r0.x, c263.y, r1.y
- *
- * Then we'd have to be more clever, as the convergence point is no
- * longer a branch or jump target.
- */
-static void
-mark_convergence_points(struct ir3 *ir)
-{
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-                       if (is_flow(instr) && instr->cat0.target) {
-                               struct ir3_instruction *target =
-                                       list_first_entry(&instr->cat0.target->instr_list,
-                                                       struct ir3_instruction, node);
-                               target->flags |= IR3_INSTR_JP;
-                       }
-               }
-       }
-}
-
-void
-ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary)
-{
-       struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
-       bool progress;
-
-       ctx->max_bary = -1;
-
-       /* allocate per-block data: */
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               block->data = rzalloc(ctx, struct ir3_legalize_block_data);
-       }
-
-       /* process each block: */
-       do {
-               progress = false;
-               list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-                       progress |= legalize_block(ctx, block);
-               }
-       } while (progress);
-
-       *num_samp = ctx->num_samp;
-       *has_ssbo = ctx->has_ssbo;
-       *max_bary = ctx->max_bary;
-
-       do {
-               ir3_count_instructions(ir);
-       } while(resolve_jumps(ir));
-
-       mark_convergence_points(ir);
-
-       ralloc_free(ctx);
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
deleted file mode 100644 (file)
index 70c01ee..0000000
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-
-#include "util/debug.h"
-
-#include "ir3_nir.h"
-#include "ir3_compiler.h"
-#include "ir3_shader.h"
-
-static const nir_shader_compiler_options options = {
-               .lower_fpow = true,
-               .lower_scmp = true,
-               .lower_flrp32 = true,
-               .lower_flrp64 = true,
-               .lower_ffract = true,
-               .lower_fmod32 = true,
-               .lower_fmod64 = true,
-               .lower_fdiv = true,
-               .lower_ldexp = true,
-               .fuse_ffma = true,
-               .native_integers = true,
-               .vertex_id_zero_based = true,
-               .lower_extract_byte = true,
-               .lower_extract_word = true,
-               .lower_all_io_to_temps = true,
-               .lower_helper_invocation = true,
-};
-
-const nir_shader_compiler_options *
-ir3_get_compiler_options(struct ir3_compiler *compiler)
-{
-       return &options;
-}
-
-/* for a given shader key, are any lowering steps handled in nir? */
-bool
-ir3_key_lowers_nir(const struct ir3_shader_key *key)
-{
-       return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
-                       key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
-                       key->ucp_enables | key->color_two_side |
-                       key->fclamp_color | key->vclamp_color;
-}
-
-#define OPT(nir, pass, ...) ({                             \
-   bool this_progress = false;                             \
-   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
-   this_progress;                                          \
-})
-
-#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
-
-static void
-ir3_optimize_loop(nir_shader *s)
-{
-       bool progress;
-       do {
-               progress = false;
-
-               OPT_V(s, nir_lower_vars_to_ssa);
-               progress |= OPT(s, nir_opt_copy_prop_vars);
-               progress |= OPT(s, nir_opt_dead_write_vars);
-               progress |= OPT(s, nir_lower_alu_to_scalar);
-               progress |= OPT(s, nir_lower_phis_to_scalar);
-
-               progress |= OPT(s, nir_copy_prop);
-               progress |= OPT(s, nir_opt_dce);
-               progress |= OPT(s, nir_opt_cse);
-               static int gcm = -1;
-               if (gcm == -1)
-                       gcm = env_var_as_unsigned("GCM", 0);
-               if (gcm == 1)
-                       progress |= OPT(s, nir_opt_gcm, true);
-               else if (gcm == 2)
-                       progress |= OPT(s, nir_opt_gcm, false);
-               progress |= OPT(s, nir_opt_peephole_select, 16);
-               progress |= OPT(s, nir_opt_intrinsics);
-               progress |= OPT(s, nir_opt_algebraic);
-               progress |= OPT(s, nir_opt_constant_folding);
-               progress |= OPT(s, nir_opt_dead_cf);
-               if (OPT(s, nir_opt_trivial_continues)) {
-                       progress |= true;
-                       /* If nir_opt_trivial_continues makes progress, then we need to clean
-                        * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
-                        * to make progress.
-                        */
-                       OPT(s, nir_copy_prop);
-                       OPT(s, nir_opt_dce);
-               }
-               progress |= OPT(s, nir_opt_if);
-               progress |= OPT(s, nir_opt_remove_phis);
-               progress |= OPT(s, nir_opt_undef);
-
-       } while (progress);
-}
-
-struct nir_shader *
-ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
-               const struct ir3_shader_key *key)
-{
-       struct nir_lower_tex_options tex_options = {
-                       .lower_rect = 0,
-       };
-
-       if (key) {
-               switch (shader->type) {
-               case MESA_SHADER_FRAGMENT:
-                       tex_options.saturate_s = key->fsaturate_s;
-                       tex_options.saturate_t = key->fsaturate_t;
-                       tex_options.saturate_r = key->fsaturate_r;
-                       break;
-               case MESA_SHADER_VERTEX:
-                       tex_options.saturate_s = key->vsaturate_s;
-                       tex_options.saturate_t = key->vsaturate_t;
-                       tex_options.saturate_r = key->vsaturate_r;
-                       break;
-               default:
-                       /* TODO */
-                       break;
-               }
-       }
-
-       if (shader->compiler->gpu_id >= 400) {
-               /* a4xx seems to have *no* sam.p */
-               tex_options.lower_txp = ~0;  /* lower all txp */
-       } else {
-               /* a3xx just needs to avoid sam.p for 3d tex */
-               tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
-       }
-
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               debug_printf("----------------------\n");
-               nir_print_shader(s, stdout);
-               debug_printf("----------------------\n");
-       }
-
-       OPT_V(s, nir_opt_global_to_local);
-       OPT_V(s, nir_lower_regs_to_ssa);
-
-       if (key) {
-               if (s->info.stage == MESA_SHADER_VERTEX) {
-                       OPT_V(s, nir_lower_clip_vs, key->ucp_enables, false);
-                       if (key->vclamp_color)
-                               OPT_V(s, nir_lower_clamp_color_outputs);
-               } else if (s->info.stage == MESA_SHADER_FRAGMENT) {
-                       OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
-                       if (key->fclamp_color)
-                               OPT_V(s, nir_lower_clamp_color_outputs);
-               }
-               if (key->color_two_side) {
-                       OPT_V(s, nir_lower_two_sided_color);
-               }
-       } else {
-               /* only want to do this the first time (when key is null)
-                * and not again on any potential 2nd variant lowering pass:
-                */
-               OPT_V(s, ir3_nir_apply_trig_workarounds);
-       }
-
-       OPT_V(s, nir_lower_tex, &tex_options);
-       OPT_V(s, nir_lower_load_const_to_scalar);
-       if (shader->compiler->gpu_id < 500)
-               OPT_V(s, ir3_nir_lower_tg4_to_tex);
-
-       ir3_optimize_loop(s);
-
-       /* do idiv lowering after first opt loop to give a chance for
-        * divide by immed power-of-two to be caught first:
-        */
-       if (OPT(s, nir_lower_idiv))
-               ir3_optimize_loop(s);
-
-       OPT_V(s, nir_remove_dead_variables, nir_var_local);
-
-       OPT_V(s, nir_move_load_const);
-
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               debug_printf("----------------------\n");
-               nir_print_shader(s, stdout);
-               debug_printf("----------------------\n");
-       }
-
-       nir_sweep(s);
-
-       return s;
-}
-
-void
-ir3_nir_scan_driver_consts(nir_shader *shader,
-               struct ir3_driver_const_layout *layout)
-{
-       nir_foreach_function(function, shader) {
-               if (!function->impl)
-                       continue;
-
-               nir_foreach_block(block, function->impl) {
-                       nir_foreach_instr(instr, block) {
-                               if (instr->type != nir_instr_type_intrinsic)
-                                       continue;
-
-                               nir_intrinsic_instr *intr =
-                                       nir_instr_as_intrinsic(instr);
-                               unsigned idx;
-
-                               switch (intr->intrinsic) {
-                               case nir_intrinsic_get_buffer_size:
-                                       idx = nir_src_as_const_value(intr->src[0])->u32[0];
-                                       if (layout->ssbo_size.mask & (1 << idx))
-                                               break;
-                                       layout->ssbo_size.mask |= (1 << idx);
-                                       layout->ssbo_size.off[idx] =
-                                               layout->ssbo_size.count;
-                                       layout->ssbo_size.count += 1; /* one const per */
-                                       break;
-                               case nir_intrinsic_image_deref_atomic_add:
-                               case nir_intrinsic_image_deref_atomic_min:
-                               case nir_intrinsic_image_deref_atomic_max:
-                               case nir_intrinsic_image_deref_atomic_and:
-                               case nir_intrinsic_image_deref_atomic_or:
-                               case nir_intrinsic_image_deref_atomic_xor:
-                               case nir_intrinsic_image_deref_atomic_exchange:
-                               case nir_intrinsic_image_deref_atomic_comp_swap:
-                               case nir_intrinsic_image_deref_store:
-                               case nir_intrinsic_image_deref_size:
-                                       idx = nir_intrinsic_get_var(intr, 0)->data.driver_location;
-                                       if (layout->image_dims.mask & (1 << idx))
-                                               break;
-                                       layout->image_dims.mask |= (1 << idx);
-                                       layout->image_dims.off[idx] =
-                                               layout->image_dims.count;
-                                       layout->image_dims.count += 3; /* three const per */
-                                       break;
-                               default:
-                                       break;
-                               }
-                       }
-               }
-       }
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/src/gallium/drivers/freedreno/ir3/ir3_nir.h
deleted file mode 100644 (file)
index 74201d3..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#ifndef IR3_NIR_H_
-#define IR3_NIR_H_
-
-#include "compiler/nir/nir.h"
-#include "compiler/shader_enums.h"
-
-#include "ir3_shader.h"
-
-void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layout *layout);
-
-bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
-bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
-
-const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
-bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
-struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
-               const struct ir3_shader_key *key);
-
-#endif /* IR3_NIR_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
deleted file mode 100644 (file)
index 37a3dcb..0000000
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright © 2017 Ilia Mirkin
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "ir3_nir.h"
-#include "compiler/nir/nir_builder.h"
-
-/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
- * gather results, rather than before. As a result, it must be emulated with
- * direct texture calls.
- */
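
The emulation in miniature, on a toy single-channel CPU-side texture. The fetch() helper and the 4x4 array are inventions of this note; the real pass below emits four txl instructions and selects tg4->component from each result. The gather order is (0,1), (1,1), (1,0), then the un-offset coordinate, matching the offsets table in lower_tg4():

    #include <stdio.h>

    /* toy 4x4 single-channel texture standing in for the real sampler */
    static float tex[4][4] = {
        { 0, 1, 2, 3 }, { 4, 5, 6, 7 }, { 8, 9, 10, 11 }, { 12, 13, 14, 15 },
    };

    static float fetch(int u, int v) { return tex[v][u]; }  /* point sample, lod 0 */

    struct vec4 { float x, y, z, w; };

    static struct vec4 emulated_tg4(int u, int v)
    {
        struct vec4 r;
        r.x = fetch(u + 0, v + 1);
        r.y = fetch(u + 1, v + 1);
        r.z = fetch(u + 1, v + 0);
        r.w = fetch(u + 0, v + 0);  /* 4th tap: original coord, no offset */
        return r;
    }

    int main(void)
    {
        struct vec4 g = emulated_tg4(1, 1);
        printf("%g %g %g %g\n", g.x, g.y, g.z, g.w);  /* 9 10 6 5 */
        return 0;
    }
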
-
-static bool
-lower_tg4(nir_block *block, nir_builder *b, void *mem_ctx)
-{
-       bool progress = false;
-
-       static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
-
-       nir_foreach_instr_safe(instr, block) {
-               if (instr->type != nir_instr_type_tex)
-                       continue;
-
-               nir_tex_instr *tg4 = (nir_tex_instr *)instr;
-
-               if (tg4->op != nir_texop_tg4)
-                       continue;
-
-               b->cursor = nir_before_instr(&tg4->instr);
-
-               nir_ssa_def *results[4];
-               int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
-               for (int i = 0; i < 4; i++) {
-                       int num_srcs = tg4->num_srcs + 1 /* lod */;
-                       if (offset_index < 0 && i < 3)
-                               num_srcs++;
-
-                       nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
-                       tex->op = nir_texop_txl;
-                       tex->sampler_dim = tg4->sampler_dim;
-                       tex->coord_components = tg4->coord_components;
-                       tex->is_array = tg4->is_array;
-                       tex->is_shadow = tg4->is_shadow;
-                       tex->is_new_style_shadow = tg4->is_new_style_shadow;
-                       tex->texture_index = tg4->texture_index;
-                       tex->sampler_index = tg4->sampler_index;
-                       tex->dest_type = tg4->dest_type;
-
-                       for (int j = 0; j < tg4->num_srcs; j++) {
-                               nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
-                               tex->src[j].src_type = tg4->src[j].src_type;
-                       }
-                       if (i != 3) {
-                               nir_ssa_def *offset =
-                                       nir_vec2(b, nir_imm_int(b, offsets[i][0]),
-                                                        nir_imm_int(b, offsets[i][1]));
-                               if (offset_index < 0) {
-                                       tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
-                                       tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
-                               } else {
-                                       assert(nir_tex_instr_src_size(tex, offset_index) == 2);
-                                       nir_ssa_def *orig = nir_ssa_for_src(
-                                                       b, tex->src[offset_index].src, 2);
-                                       tex->src[offset_index].src =
-                                               nir_src_for_ssa(nir_iadd(b, orig, offset));
-                               }
-                       }
-                       tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
-                       tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
-
-                       nir_ssa_dest_init(&tex->instr, &tex->dest,
-                                                         nir_tex_instr_dest_size(tex), 32, NULL);
-                       nir_builder_instr_insert(b, &tex->instr);
-
-                       results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
-               }
-
-               nir_ssa_def *result = nir_vec4(b, results[0], results[1], results[2], results[3]);
-               nir_ssa_def_rewrite_uses(&tg4->dest.ssa, nir_src_for_ssa(result));
-
-               nir_instr_remove(&tg4->instr);
-
-               progress = true;
-       }
-
-       return progress;
-}
-
-static bool
-lower_tg4_func(nir_function_impl *impl)
-{
-       void *mem_ctx = ralloc_parent(impl);
-       nir_builder b;
-       nir_builder_init(&b, impl);
-
-       bool progress = false;
-       nir_foreach_block_safe(block, impl) {
-               progress |= lower_tg4(block, &b, mem_ctx);
-       }
-
-       if (progress)
-               nir_metadata_preserve(impl, nir_metadata_block_index |
-                                                                       nir_metadata_dominance);
-
-       return progress;
-}
-
-bool
-ir3_nir_lower_tg4_to_tex(nir_shader *shader)
-{
-       bool progress = false;
-
-       nir_foreach_function(function, shader) {
-               if (function->impl)
-                       progress |= lower_tg4_func(function->impl);
-       }
-
-       return progress;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py b/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py
deleted file mode 100644 (file)
index 3968aea..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-#
-# Copyright (C) 2016 Intel Corporation
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
-# to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the
-# Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-from __future__ import print_function
-
-import argparse
-import sys
-
-trig_workarounds = [
-   (('fsin', 'x'), ('fsin', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
-   (('fcos', 'x'), ('fcos', ('fsub', ('fmul', 6.283185, ('ffract', ('fadd', ('fmul', 0.159155, 'x'), 0.5))), 3.141593))),
-]
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-p', '--import-path', required=True)
-    args = parser.parse_args()
-    sys.path.insert(0, args.import_path)
-    run()
-
-
-def run():
-    import nir_algebraic  # pylint: disable=import-error
-
-    print('#include "ir3_nir.h"')
-    print(nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds",
-                                      trig_workarounds).render())
-
-
-if __name__ == '__main__':
-    main()
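The constants in trig_workarounds above are 1/(2\pi), 2\pi and \pi rounded to six decimals, so the rewrite is a range reduction: it folds the operand into [-\pi, \pi) before the hardware sin/cos, which is exact by periodicity. Writing x = 2\pi k + r with r \in [-\pi, \pi):

    \mathrm{fract}\left(\frac{x}{2\pi} + \frac{1}{2}\right) = \frac{r + \pi}{2\pi}
    \quad\Rightarrow\quad
    2\pi\,\mathrm{fract}\left(\frac{x}{2\pi} + \frac{1}{2}\right) - \pi = r,
    \qquad \sin(r) = \sin(x)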
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
deleted file mode 100644 (file)
index b6ef6e4..0000000
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <stdarg.h>
-#include <stdio.h>
-
-#include "ir3.h"
-
-#define PTRID(x) ((unsigned long)(x))
-
-static void print_instr_name(struct ir3_instruction *instr)
-{
-       if (!instr)
-               return;
-#ifdef DEBUG
-       printf("%04u:", instr->serialno);
-#endif
-       printf("%04u:", instr->name);
-       printf("%04u:", instr->ip);
-       printf("%03u: ", instr->depth);
-
-       if (instr->flags & IR3_INSTR_SY)
-               printf("(sy)");
-       if (instr->flags & IR3_INSTR_SS)
-               printf("(ss)");
-
-       if (is_meta(instr)) {
-               switch (instr->opc) {
-               case OPC_META_INPUT:  printf("_meta:in");   break;
-               case OPC_META_FO:     printf("_meta:fo");   break;
-               case OPC_META_FI:     printf("_meta:fi");   break;
-
-               /* shouldn't hit here.. just for debugging: */
-               default: printf("_meta:%d", instr->opc);    break;
-               }
-       } else if (instr->opc == OPC_MOV) {
-               static const char *type[] = {
-                               [TYPE_F16] = "f16",
-                               [TYPE_F32] = "f32",
-                               [TYPE_U16] = "u16",
-                               [TYPE_U32] = "u32",
-                               [TYPE_S16] = "s16",
-                               [TYPE_S32] = "s32",
-                               [TYPE_U8]  = "u8",
-                               [TYPE_S8]  = "s8",
-               };
-               if (instr->cat1.src_type == instr->cat1.dst_type)
-                       printf("mov");
-               else
-                       printf("cov");
-               printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
-       } else {
-               printf("%s", ir3_instr_name(instr));
-               if (instr->flags & IR3_INSTR_3D)
-                       printf(".3d");
-               if (instr->flags & IR3_INSTR_A)
-                       printf(".a");
-               if (instr->flags & IR3_INSTR_O)
-                       printf(".o");
-               if (instr->flags & IR3_INSTR_P)
-                       printf(".p");
-               if (instr->flags & IR3_INSTR_S)
-                       printf(".s");
-               if (instr->flags & IR3_INSTR_S2EN)
-                       printf(".s2en");
-       }
-}
-
-static void print_reg_name(struct ir3_register *reg)
-{
-       if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
-                       (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
-               printf("(absneg)");
-       else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
-               printf("(neg)");
-       else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
-               printf("(abs)");
-
-       if (reg->flags & IR3_REG_IMMED) {
-               printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
-       } else if (reg->flags & IR3_REG_ARRAY) {
-               printf("arr[id=%u, offset=%d, size=%u", reg->array.id,
-                               reg->array.offset, reg->size);
-               /* for ARRAY we could have a null src, for example the first
-                * write instruction..
-                */
-               if (reg->instr) {
-                       printf(", _[");
-                       print_instr_name(reg->instr);
-                       printf("]");
-               }
-               printf("]");
-       } else if (reg->flags & IR3_REG_SSA) {
-               printf("_[");
-               print_instr_name(reg->instr);
-               printf("]");
-       } else if (reg->flags & IR3_REG_RELATIV) {
-               if (reg->flags & IR3_REG_HALF)
-                       printf("h");
-               if (reg->flags & IR3_REG_CONST)
-                       printf("c<a0.x + %d>", reg->array.offset);
-               else
-                       printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size);
-       } else {
-               if (reg->flags & IR3_REG_HALF)
-                       printf("h");
-               if (reg->flags & IR3_REG_CONST)
-                       printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
-               else
-                       printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
-       }
-}
-
-static void
-tab(int lvl)
-{
-       for (int i = 0; i < lvl; i++)
-               printf("\t");
-}
-
-static void
-print_instr(struct ir3_instruction *instr, int lvl)
-{
-       unsigned i;
-
-       tab(lvl);
-
-       print_instr_name(instr);
-       for (i = 0; i < instr->regs_count; i++) {
-               struct ir3_register *reg = instr->regs[i];
-               printf(i ? ", " : " ");
-               print_reg_name(reg);
-       }
-
-       if (instr->address) {
-               printf(", address=_");
-               printf("[");
-               print_instr_name(instr->address);
-               printf("]");
-       }
-
-       if (instr->cp.left) {
-               printf(", left=_");
-               printf("[");
-               print_instr_name(instr->cp.left);
-               printf("]");
-       }
-
-       if (instr->cp.right) {
-               printf(", right=_");
-               printf("[");
-               print_instr_name(instr->cp.right);
-               printf("]");
-       }
-
-       if (instr->opc == OPC_META_FO) {
-               printf(", off=%d", instr->fo.off);
-       }
-
-       if (is_flow(instr) && instr->cat0.target) {
-               /* the predicate register src is implied: */
-               if (instr->opc == OPC_BR) {
-                       printf(" %sp0.x", instr->cat0.inv ? "!" : "");
-               }
-               printf(", target=block%u", block_id(instr->cat0.target));
-       }
-
-       if (instr->deps_count) {
-               printf(", false-deps:");
-               for (unsigned i = 0; i < instr->deps_count; i++) {
-                       if (i > 0)
-                               printf(", ");
-                       printf("_[");
-                       print_instr_name(instr->deps[i]);
-                       printf("]");
-               }
-       }
-
-       printf("\n");
-}
-
-void ir3_print_instr(struct ir3_instruction *instr)
-{
-       print_instr(instr, 0);
-}
-
-static void
-print_block(struct ir3_block *block, int lvl)
-{
-       tab(lvl); printf("block%u {\n", block_id(block));
-
-       if (block->predecessors_count > 0) {
-               tab(lvl+1);
-               printf("pred: ");
-               for (unsigned i = 0; i < block->predecessors_count; i++) {
-                       if (i)
-                               printf(", ");
-                       printf("block%u", block_id(block->predecessors[i]));
-               }
-               printf("\n");
-       }
-
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-               print_instr(instr, lvl+1);
-       }
-
-       tab(lvl+1); printf("/* keeps:\n");
-       for (unsigned i = 0; i < block->keeps_count; i++) {
-               print_instr(block->keeps[i], lvl+2);
-       }
-       tab(lvl+1); printf(" */\n");
-
-       if (block->successors[1]) {
-               /* leading into if/else: */
-               tab(lvl+1);
-               printf("/* succs: if _[");
-               print_instr_name(block->condition);
-               printf("] block%u; else block%u; */\n",
-                               block_id(block->successors[0]),
-                               block_id(block->successors[1]));
-       } else if (block->successors[0]) {
-               tab(lvl+1);
-               printf("/* succs: block%u; */\n",
-                               block_id(block->successors[0]));
-       }
-       tab(lvl); printf("}\n");
-}
-
-void
-ir3_print(struct ir3 *ir)
-{
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
-               print_block(block, 0);
-
-       for (unsigned i = 0; i < ir->noutputs; i++) {
-               if (!ir->outputs[i])
-                       continue;
-               printf("out%d: ", i);
-               print_instr(ir->outputs[i], 0);
-       }
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
deleted file mode 100644 (file)
index ad09c40..0000000
+++ /dev/null
@@ -1,1124 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "util/u_math.h"
-#include "util/register_allocate.h"
-#include "util/ralloc.h"
-#include "util/bitset.h"
-
-#include "ir3.h"
-#include "ir3_compiler.h"
-
-/*
- * Register Assignment:
- *
- * Uses the register_allocate util, which implements a graph coloring
- * algo with interference classes.  To handle the cases where we need
- * consecutive registers (for example, texture sample instructions),
- * we model these as larger (double/quad/etc) registers which conflict
- * with the corresponding registers in other classes.
- *
- * Additionally we create additional classes for half-regs, which
- * do not conflict with the full-reg classes.  We do need at least
- * sizes 1-4 (to deal w/ texture sample instructions output to half-
- * reg).  At the moment we don't create the higher order half-reg
- * classes as half-reg frequently does not have enough precision
- * for texture coords at higher resolutions.
- *
- * There are some additional cases that we need to handle specially,
- * as the graph coloring algo doesn't understand "partial writes".
- * For example, a sequence like:
- *
- *   add r0.z, ...
- *   sam (f32)(xy)r0.x, ...
- *   ...
- *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
- *
- * In this scenario, we treat r0.xyz as class size 3, which is written
- * (from a use/def perspective) at the 'add' instruction and ignore the
- * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
- * defining instruction, as it is the first to partially write r0.xyz.
- *
- * Note i965 has a similar scenario, which they solve with a virtual
- * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
- * register assignment.  But for us that is horrible from a scheduling
- * standpoint.  Instead what we do is use the idea of a 'definer'
- * instruction.  Ie. the first instruction (lowest ip) to write to the
- * variable is the one we consider from a use/def perspective when
- * building the interference graph.  (Other instructions which write
- * other variable components just define the variable some more.)
- *
- * Arrays of arbitrary size are handled via pre-coloring a consecutive
- * sequence of registers.  Additional scalar (single component) reg
- * names are allocated starting at ctx->class_base[total_class_count]
- * (see arr->base), which are pre-colored.  In the use/def graph direct
- * access is treated as a single element use/def, and indirect access
- * is treated as use or def of all array elements.  (Only the first
- * def is tracked, in case of multiple indirect writes, etc.)
- *
- * TODO arrays that fit in one of the pre-defined class sizes should
- * not need to be pre-colored, but instead could be given a normal
- * vreg name.  (Ignoring this for now since it is a good way to work
- * out the kinks with arbitrary sized arrays.)
- *
- * TODO might be easier for debugging to split this into two passes,
- * the first assigning vreg names in a way that we could ir3_print()
- * the result.
- */
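To make the 'definer' rule above concrete, a small self-contained sketch (hypothetical types, not the ir3 ones): among all instructions writing components of one multi-component value, the lowest-ip write is taken as the def, so in the example above the 'add r0.z' is the definer of the class-3 value.

    /* pick the earliest (lowest ip) writer as the definer */
    struct write { unsigned ip; char comp; };

    static unsigned find_definer(const struct write *w, unsigned n)
    {
        unsigned d = 0;
        for (unsigned i = 1; i < n; i++)
            if (w[i].ip < w[d].ip)
                d = i;
        return d;   /* index of the defining write */
    }

Every other write to the same value just extends its live range; the interference graph only ever sees one node for it, defined at that ip.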
-
-static const unsigned class_sizes[] = {
-       1, 2, 3, 4,
-       4 + 4, /* txd + 1d/2d */
-       4 + 6, /* txd + 3d */
-};
-#define class_count ARRAY_SIZE(class_sizes)
-
-static const unsigned half_class_sizes[] = {
-       1, 2, 3, 4,
-};
-#define half_class_count  ARRAY_SIZE(half_class_sizes)
-
-/* high regs seem to just be used for compute shaders?  Seems like vec1
- * and vec3 are sufficient (for now?)
- */
-static const unsigned high_class_sizes[] = {
-       1, 3,
-};
-#define high_class_count ARRAY_SIZE(high_class_sizes)
-
-#define total_class_count (class_count + half_class_count + high_class_count)
-
-/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
-#define NUM_REGS             (4 * 48)  /* r0 to r47 */
-#define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
-#define FIRST_HIGH_REG       (4 * 48)
-/* Number of virtual regs in a given class: */
-#define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
-#define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
-#define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
-
-#define HALF_OFFSET          (class_count)
-#define HIGH_OFFSET          (class_count + half_class_count)
-
-/* register-set, created one time, used for all shaders: */
-struct ir3_ra_reg_set {
-       struct ra_regs *regs;
-       unsigned int classes[class_count];
-       unsigned int half_classes[half_class_count];
-       unsigned int high_classes[high_class_count];
-       /* maps flat virtual register space to base gpr: */
-       uint16_t *ra_reg_to_gpr;
-       /* maps cls,gpr to flat virtual register space: */
-       uint16_t **gpr_to_ra_reg;
-};
-
-static void
-build_q_values(unsigned int **q_values, unsigned off,
-               const unsigned *sizes, unsigned count)
-{
-       for (unsigned i = 0; i < count; i++) {
-               q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
-
-               /* From register_allocate.c:
-                *
-                * q(B,C) (indexed by C, B is this register class) in
-                * Runeson/Nyström paper.  This is "how many registers of B could
-                * the worst choice register from C conflict with".
-                *
-                * If we just let the register allocation algorithm compute these
-                * values, it is extremely expensive.  However, since all of our
-                * registers are laid out, we can very easily compute them
-                * ourselves.  View the register from C as fixed starting at GRF n
-                * somewhere in the middle, and the register from B as sliding back
-                * and forth.  Then the first register to conflict from B is the
-                * one starting at n - class_size[B] + 1 and the last register to
-                * conflict will start at n + class_size[B] - 1.  Therefore, the
-                * number of conflicts from B is class_size[B] + class_size[C] - 1.
-                *
-                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
-                * B | | | | | |n| --> | | | | | | |
-                *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
-                *             +-+-+-+-+-+
-                * C           |n| | | | |
-                *             +-+-+-+-+-+
-                *
-                * (Idea copied from brw_fs_reg_allocate.cpp)
-                */
-               for (unsigned j = 0; j < count; j++)
-                       q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
-       }
-}
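As a numeric check of the closed form in the comment: with the full-precision sizes from class_sizes[] above, q(B,C) = size_B + size_C - 1, e.g. a 2-wide register can collide with at most 2 + 3 - 1 = 4 registers of the 3-wide class. A standalone sketch that prints the whole table:

    #include <stdio.h>

    int main(void)
    {
        /* full-precision class sizes, per class_sizes[] above */
        const unsigned sizes[] = { 1, 2, 3, 4, 8, 10 };
        const unsigned n = sizeof(sizes) / sizeof(sizes[0]);

        for (unsigned b = 0; b < n; b++) {
            for (unsigned c = 0; c < n; c++)
                printf("%4u", sizes[b] + sizes[c] - 1);
            printf("\n");
        }
        return 0;
    }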
-
-/* One-time setup of RA register-set, which describes all the possible
- * "virtual" registers and their interferences.  Ie. double register
- * occupies (and conflicts with) two single registers, and so forth.
- * Since registers do not need to be aligned to their class size, they
- * can conflict with other registers in the same class too.  Ie:
- *
- *    Single (base) |  Double
- *    --------------+---------------
- *       R0         |  D0
- *       R1         |  D0 D1
- *       R2         |     D1 D2
- *       R3         |        D2
- *           .. and so on..
- *
- * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
- * really just four scalar registers.  Don't let that confuse you.)
- */
-struct ir3_ra_reg_set *
-ir3_ra_alloc_reg_set(struct ir3_compiler *compiler)
-{
-       struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set);
-       unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
-       unsigned int **q_values;
-
-       /* calculate # of regs across all classes: */
-       ra_reg_count = 0;
-       for (unsigned i = 0; i < class_count; i++)
-               ra_reg_count += CLASS_REGS(i);
-       for (unsigned i = 0; i < half_class_count; i++)
-               ra_reg_count += HALF_CLASS_REGS(i);
-       for (unsigned i = 0; i < high_class_count; i++)
-               ra_reg_count += HIGH_CLASS_REGS(i);
-
-       /* allocate and populate q_values: */
-       q_values = ralloc_array(set, unsigned *, total_class_count);
-
-       build_q_values(q_values, 0, class_sizes, class_count);
-       build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
-       build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
-
-       /* allocate the reg-set.. */
-       set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
-       set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
-       set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
-
-       /* .. and classes */
-       reg = 0;
-       for (unsigned i = 0; i < class_count; i++) {
-               set->classes[i] = ra_alloc_reg_class(set->regs);
-
-               set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
-
-               for (unsigned j = 0; j < CLASS_REGS(i); j++) {
-                       ra_class_add_reg(set->regs, set->classes[i], reg);
-
-                       set->ra_reg_to_gpr[reg] = j;
-                       set->gpr_to_ra_reg[i][j] = reg;
-
-                       for (unsigned br = j; br < j + class_sizes[i]; br++)
-                               ra_add_transitive_reg_conflict(set->regs, br, reg);
-
-                       reg++;
-               }
-       }
-
-       first_half_reg = reg;
-       base = HALF_OFFSET;
-
-       for (unsigned i = 0; i < half_class_count; i++) {
-               set->half_classes[i] = ra_alloc_reg_class(set->regs);
-
-               set->gpr_to_ra_reg[base + i] =
-                               ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
-
-               for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
-                       ra_class_add_reg(set->regs, set->half_classes[i], reg);
-
-                       set->ra_reg_to_gpr[reg] = j;
-                       set->gpr_to_ra_reg[base + i][j] = reg;
-
-                       for (unsigned br = j; br < j + half_class_sizes[i]; br++)
-                               ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
-
-                       reg++;
-               }
-       }
-
-       first_high_reg = reg;
-       base = HIGH_OFFSET;
-
-       for (unsigned i = 0; i < high_class_count; i++) {
-               set->high_classes[i] = ra_alloc_reg_class(set->regs);
-
-               set->gpr_to_ra_reg[base + i] =
-                               ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
-
-               for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
-                       ra_class_add_reg(set->regs, set->high_classes[i], reg);
-
-                       set->ra_reg_to_gpr[reg] = j;
-                       set->gpr_to_ra_reg[base + i][j] = reg;
-
-                       for (unsigned br = j; br < j + high_class_sizes[i]; br++)
-                               ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
-
-                       reg++;
-               }
-       }
-
-       /* starting a6xx, half precision regs conflict w/ full precision regs: */
-       if (compiler->gpu_id >= 600) {
-               /* because of transitivity, we can get away with just setting up
-                * conflicts between the first class of full and half regs:
-                */
-               for (unsigned j = 0; j < CLASS_REGS(0) / 2; j++) {
-                       unsigned freg  = set->gpr_to_ra_reg[0][j];
-                       unsigned hreg0 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 0];
-                       unsigned hreg1 = set->gpr_to_ra_reg[HALF_OFFSET][(j * 2) + 1];
-
-                       ra_add_transitive_reg_conflict(set->regs, freg, hreg0);
-                       ra_add_transitive_reg_conflict(set->regs, freg, hreg1);
-               }
-
-               // TODO also need to update q_values, but for now:
-               ra_set_finalize(set->regs, NULL);
-       } else {
-               ra_set_finalize(set->regs, q_values);
-       }
-
-       ralloc_free(q_values);
-
-       return set;
-}
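A quick way to read the Single/Double table from the comment above: since classes need no alignment, the size-s register based at scalar n covers scalars n..n+s-1, so scalar n is covered by the doubles based at n-1 and n. An illustrative standalone sketch (not driver code):

    #include <stdio.h>

    int main(void)
    {
        const unsigned size = 2;   /* double class */
        const unsigned n = 2;      /* scalar R2 */
        unsigned first = (n >= size - 1) ? n - (size - 1) : 0;

        for (unsigned d = first; d <= n; d++)
            printf("R%u overlaps D%u (D%u covers R%u..R%u)\n",
                   n, d, d, d, d + size - 1);
        return 0;
    }

On a6xx the same mechanism extends across precisions: full register r(j) conflicts with half registers hr(2j) and hr(2j+1), and ra_add_transitive_reg_conflict() propagates that to the wider classes.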
-
-/* additional block-data (per-block) */
-struct ir3_ra_block_data {
-       BITSET_WORD *def;        /* variables defined before used in block */
-       BITSET_WORD *use;        /* variables used before defined in block */
-       BITSET_WORD *livein;     /* which defs reach entry point of block */
-       BITSET_WORD *liveout;    /* which defs reach exit point of block */
-};
-
-/* additional instruction-data (per-instruction) */
-struct ir3_ra_instr_data {
-       /* cached instruction 'definer' info: */
-       struct ir3_instruction *defn;
-       int off, sz, cls;
-};
-
-/* register-assign context, per-shader */
-struct ir3_ra_ctx {
-       struct ir3 *ir;
-       gl_shader_stage type;
-       bool frag_face;
-
-       struct ir3_ra_reg_set *set;
-       struct ra_graph *g;
-       unsigned alloc_count;
-       /* one per class, plus one slot for arrays: */
-       unsigned class_alloc_count[total_class_count + 1];
-       unsigned class_base[total_class_count + 1];
-       unsigned instr_cnt;
-       unsigned *def, *use;     /* def/use table */
-       struct ir3_ra_instr_data *instrd;
-};
-
-/* does it conflict? */
-static inline bool
-intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
-{
-       return !((a_start >= b_end) || (b_start >= a_end));
-}
-
-static bool
-is_half(struct ir3_instruction *instr)
-{
-       return !!(instr->regs[0]->flags & IR3_REG_HALF);
-}
-
-static bool
-is_high(struct ir3_instruction *instr)
-{
-       return !!(instr->regs[0]->flags & IR3_REG_HIGH);
-}
-
-static int
-size_to_class(unsigned sz, bool half, bool high)
-{
-       if (high) {
-               for (unsigned i = 0; i < high_class_count; i++)
-                       if (high_class_sizes[i] >= sz)
-                               return i + HIGH_OFFSET;
-       } else if (half) {
-               for (unsigned i = 0; i < half_class_count; i++)
-                       if (half_class_sizes[i] >= sz)
-                               return i + HALF_OFFSET;
-       } else {
-               for (unsigned i = 0; i < class_count; i++)
-                       if (class_sizes[i] >= sz)
-                               return i;
-       }
-       debug_assert(0);
-       return -1;
-}
-
-static bool
-writes_gpr(struct ir3_instruction *instr)
-{
-       if (is_store(instr))
-               return false;
-       /* is dest a normal temp register: */
-       struct ir3_register *reg = instr->regs[0];
-       if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
-               return false;
-       if ((reg->num == regid(REG_A0, 0)) ||
-                       (reg->num == regid(REG_P0, 0)))
-               return false;
-       return true;
-}
-
-static bool
-instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
-{
-       if (a->flags & IR3_INSTR_UNUSED)
-               return false;
-       return (a->ip < b->ip);
-}
-
-static struct ir3_instruction *
-get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
-               int *sz, int *off)
-{
-       struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-       struct ir3_instruction *d = NULL;
-
-       if (id->defn) {
-               *sz = id->sz;
-               *off = id->off;
-               return id->defn;
-       }
-
-       if (instr->opc == OPC_META_FI) {
-               /* What about the case where the collect is a subset of an
-                * array?  We would need to find the distance between where
-                * the actual array starts and the fanin..  that probably
-                * doesn't happen currently.
-                */
-               struct ir3_register *src;
-               int dsz, doff;
-
-               /* note: don't use foreach_ssa_src as this gets called once
-                * while assigning regs (which clears SSA flag)
-                */
-               foreach_src_n(src, n, instr) {
-                       struct ir3_instruction *dd;
-                       if (!src->instr)
-                               continue;
-
-                       dd = get_definer(ctx, src->instr, &dsz, &doff);
-
-                       if ((!d) || instr_before(dd, d)) {
-                               d = dd;
-                               *sz = dsz;
-                               *off = doff - n;
-                       }
-               }
-
-       } else if (instr->cp.right || instr->cp.left) {
-               /* covers also the meta:fo case, which ends up w/ single
-                * scalar instructions for each component:
-                */
-               struct ir3_instruction *f = ir3_neighbor_first(instr);
-
-               /* by definition, the entire sequence forms one linked list
-                * of single scalar register nodes (even if some of them may
-                * be fanouts from, for example, a texture sample instr).  We
-                * just need to walk the list to find the first element of
-                * the group defined (lowest ip).
-                */
-               int cnt = 0;
-
-               /* need to skip over unused in the group: */
-               while (f && (f->flags & IR3_INSTR_UNUSED)) {
-                       f = f->cp.right;
-                       cnt++;
-               }
-
-               while (f) {
-                       if ((!d) || instr_before(f, d))
-                               d = f;
-                       if (f == instr)
-                               *off = cnt;
-                       f = f->cp.right;
-                       cnt++;
-               }
-
-               *sz = cnt;
-
-       } else {
-               /* second case is looking directly at the instruction which
-                * produces multiple values (eg, texture sample), rather
-                * than the fanout nodes that point back to that instruction.
-                * This isn't quite right, because it may be part of a larger
-                * group, such as:
-                *
-                *     sam (f32)(xyzw)r0.x, ...
-                *     add r1.x, ...
-                *     add r1.y, ...
-                *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
-                *
-                * need to come up with a better way to handle that case.
-                */
-               if (instr->address) {
-                       *sz = instr->regs[0]->size;
-               } else {
-                       *sz = util_last_bit(instr->regs[0]->wrmask);
-               }
-               *off = 0;
-               d = instr;
-       }
-
-       if (d->opc == OPC_META_FO) {
-               struct ir3_instruction *dd;
-               int dsz, doff;
-
-               dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
-
-               /* by definition, should come before: */
-               debug_assert(instr_before(dd, d));
-
-               *sz = MAX2(*sz, dsz);
-
-               debug_assert(instr->opc == OPC_META_FO);
-               *off = MAX2(*off, instr->fo.off);
-
-               d = dd;
-       }
-
-       id->defn = d;
-       id->sz = *sz;
-       id->off = *off;
-
-       return d;
-}
-
-static void
-ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-               if (instr->regs_count == 0)
-                       continue;
-               /* couple special cases: */
-               if (writes_addr(instr) || writes_pred(instr)) {
-                       id->cls = -1;
-               } else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
-                       id->cls = total_class_count;
-               } else {
-                       id->defn = get_definer(ctx, instr, &id->sz, &id->off);
-                       id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
-               }
-       }
-}
-
-/* give each instruction a name (and ip), and count up the # of names
- * of each class
- */
-static void
-ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-               struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-
-#ifdef DEBUG
-               instr->name = ~0;
-#endif
-
-               ctx->instr_cnt++;
-
-               if (instr->regs_count == 0)
-                       continue;
-
-               if (!writes_gpr(instr))
-                       continue;
-
-               if (id->defn != instr)
-                       continue;
-
-               /* arrays which don't fit in one of the pre-defined class
-                * sizes are pre-colored:
-                */
-               if ((id->cls >= 0) && (id->cls < total_class_count)) {
-                       instr->name = ctx->class_alloc_count[id->cls]++;
-                       ctx->alloc_count++;
-               }
-       }
-}
-
-static void
-ra_init(struct ir3_ra_ctx *ctx)
-{
-       unsigned n, base;
-
-       ir3_clear_mark(ctx->ir);
-       n = ir3_count_instructions(ctx->ir);
-
-       ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
-
-       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-               ra_block_find_definers(ctx, block);
-       }
-
-       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-               ra_block_name_instructions(ctx, block);
-       }
-
-       /* figure out the base register name for each class.  The
-        * actual ra name is class_base[cls] + instr->name;
-        */
-       ctx->class_base[0] = 0;
-       for (unsigned i = 1; i <= total_class_count; i++) {
-               ctx->class_base[i] = ctx->class_base[i-1] +
-                               ctx->class_alloc_count[i-1];
-       }
-
-       /* and vreg names for array elements: */
-       base = ctx->class_base[total_class_count];
-       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-               arr->base = base;
-               ctx->class_alloc_count[total_class_count] += arr->length;
-               base += arr->length;
-       }
-       ctx->alloc_count += ctx->class_alloc_count[total_class_count];
-
-       ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
-       ralloc_steal(ctx->g, ctx->instrd);
-       ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
-       ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
-}
-
-static unsigned
-__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
-{
-       unsigned name;
-       debug_assert(cls >= 0);
-       debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
-       name = ctx->class_base[cls] + defn->name;
-       debug_assert(name < ctx->alloc_count);
-       return name;
-}
-
-static int
-ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
-{
-       /* TODO handle name mapping for arrays */
-       return __ra_name(ctx, id->cls, id->defn);
-}
-
-static void
-ra_destroy(struct ir3_ra_ctx *ctx)
-{
-       ralloc_free(ctx->g);
-}
-
-static void
-ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-       struct ir3_ra_block_data *bd;
-       unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
-
-#define def(name, instr) \
-               do { \
-                       /* defined on first write: */ \
-                       if (!ctx->def[name]) \
-                               ctx->def[name] = instr->ip; \
-                       ctx->use[name] = instr->ip; \
-                       BITSET_SET(bd->def, name); \
-               } while(0);
-
-#define use(name, instr) \
-               do { \
-                       ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
-                       if (!BITSET_TEST(bd->def, name)) \
-                               BITSET_SET(bd->use, name); \
-               } while(0);
-
-       bd = rzalloc(ctx->g, struct ir3_ra_block_data);
-
-       bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
-       bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
-       bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
-       bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
-
-       block->data = bd;
-
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-               struct ir3_instruction *src;
-               struct ir3_register *reg;
-
-               if (instr->regs_count == 0)
-                       continue;
-
-               /* There are a couple special cases to deal with here:
-                *
-                * fanout: used to split values from a higher class to a lower
-                *     class, for example split the results of a texture fetch
-                *     into individual scalar values;  We skip over these from
-                *     a 'def' perspective, and for a 'use' we walk the chain
-                *     up to the defining instruction.
-                *
-                * fanin: used to collect values from lower class and assemble
-                *     them together into a higher class, for example arguments
-                *     to texture sample instructions;  We consider these to be
-                *     defined at the earliest fanin source.
-                *
-                * Most of this is handled in the get_definer() helper.
-                *
-                * In either case, we trace the instruction back to the original
-                * definer and consider that as the def/use ip.
-                */
-
-               if (writes_gpr(instr)) {
-                       struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-                       struct ir3_register *dst = instr->regs[0];
-
-                       if (dst->flags & IR3_REG_ARRAY) {
-                               struct ir3_array *arr =
-                                       ir3_lookup_array(ctx->ir, dst->array.id);
-                               unsigned i;
-
-                               arr->start_ip = MIN2(arr->start_ip, instr->ip);
-                               arr->end_ip = MAX2(arr->end_ip, instr->ip);
-
-                               /* set the node class now.. in case we don't encounter
-                                * this array dst again.  From register_alloc algo's
-                                * perspective, these are all single/scalar regs:
-                                */
-                               for (i = 0; i < arr->length; i++) {
-                                       unsigned name = arr->base + i;
-                                       ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
-                               }
-
-                               /* indirect write is treated like a write to all array
-                                * elements, since we don't know which one is actually
-                                * written:
-                                */
-                               if (dst->flags & IR3_REG_RELATIV) {
-                                       for (i = 0; i < arr->length; i++) {
-                                               unsigned name = arr->base + i;
-                                               def(name, instr);
-                                       }
-                               } else {
-                                       unsigned name = arr->base + dst->array.offset;
-                                       def(name, instr);
-                               }
-
-                       } else if (id->defn == instr) {
-                               unsigned name = ra_name(ctx, id);
-
-                               /* since we are in SSA at this point: */
-                               debug_assert(!BITSET_TEST(bd->use, name));
-
-                               def(name, id->defn);
-
-                               if (is_high(id->defn)) {
-                                       ra_set_node_class(ctx->g, name,
-                                                       ctx->set->high_classes[id->cls - HIGH_OFFSET]);
-                               } else if (is_half(id->defn)) {
-                                       ra_set_node_class(ctx->g, name,
-                                                       ctx->set->half_classes[id->cls - HALF_OFFSET]);
-                               } else {
-                                       ra_set_node_class(ctx->g, name,
-                                                       ctx->set->classes[id->cls]);
-                               }
-                       }
-               }
-
-               foreach_src(reg, instr) {
-                       if (reg->flags & IR3_REG_ARRAY) {
-                               struct ir3_array *arr =
-                                       ir3_lookup_array(ctx->ir, reg->array.id);
-                               arr->start_ip = MIN2(arr->start_ip, instr->ip);
-                               arr->end_ip = MAX2(arr->end_ip, instr->ip);
-
-                               /* indirect read is treated like a read from all array
-                                * elements, since we don't know which one is actually
-                                * read:
-                                */
-                               if (reg->flags & IR3_REG_RELATIV) {
-                                       unsigned i;
-                                       for (i = 0; i < arr->length; i++) {
-                                               unsigned name = arr->base + i;
-                                               use(name, instr);
-                                       }
-                               } else {
-                                       unsigned name = arr->base + reg->array.offset;
-                                       use(name, instr);
-                                       /* NOTE: arrays are not SSA so unconditionally
-                                        * set use bit:
-                                        */
-                                       BITSET_SET(bd->use, name);
-                                       debug_assert(reg->array.offset < arr->length);
-                               }
-                       } else if ((src = ssa(reg)) && writes_gpr(src)) {
-                               unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
-                               use(name, instr);
-                       }
-               }
-       }
-}
-
-static bool
-ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
-{
-       unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
-       bool progress = false;
-
-       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-               struct ir3_ra_block_data *bd = block->data;
-
-               /* update livein: */
-               for (unsigned i = 0; i < bitset_words; i++) {
-                       BITSET_WORD new_livein =
-                               (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
-
-                       if (new_livein & ~bd->livein[i]) {
-                               bd->livein[i] |= new_livein;
-                               progress = true;
-                       }
-               }
-
-               /* update liveout: */
-               for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
-                       struct ir3_block *succ = block->successors[j];
-                       struct ir3_ra_block_data *succ_bd;
-
-                       if (!succ)
-                               continue;
-
-                       succ_bd = succ->data;
-
-                       for (unsigned i = 0; i < bitset_words; i++) {
-                               BITSET_WORD new_liveout =
-                                       (succ_bd->livein[i] & ~bd->liveout[i]);
-
-                               if (new_liveout) {
-                                       bd->liveout[i] |= new_liveout;
-                                       progress = true;
-                               }
-                       }
-               }
-       }
-
-       return progress;
-}
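The loop above is the textbook backward liveness fixpoint; in standard dataflow notation (not from the source):

    \mathrm{livein}(b)  = \mathrm{use}(b) \cup \big(\mathrm{liveout}(b) \setminus \mathrm{def}(b)\big)
    \mathrm{liveout}(b) = \bigcup_{s \in \mathrm{succ}(b)} \mathrm{livein}(s)

ra_add_interference() below simply iterates this function until it returns false; termination is guaranteed because the bitsets only ever grow and are bounded by alloc_count bits.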
-
-static void
-print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
-{
-       bool first = true;
-       debug_printf("  %s:", name);
-       for (unsigned i = 0; i < cnt; i++) {
-               if (BITSET_TEST(bs, i)) {
-                       if (!first)
-                               debug_printf(",");
-                       debug_printf(" %04u", i);
-                       first = false;
-               }
-       }
-       debug_printf("\n");
-}
-
-static void
-ra_add_interference(struct ir3_ra_ctx *ctx)
-{
-       struct ir3 *ir = ctx->ir;
-
-       /* initialize array live ranges: */
-       list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
-               arr->start_ip = ~0;
-               arr->end_ip = 0;
-       }
-
-       /* compute live ranges (use/def) on a block level, also updating
-        * block's def/use bitmasks (used below to calculate per-block
-        * livein/liveout):
-        */
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               ra_block_compute_live_ranges(ctx, block);
-       }
-
-       /* update per-block livein/liveout: */
-       while (ra_compute_livein_liveout(ctx)) {}
-
-       if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
-               debug_printf("AFTER LIVEIN/OUT:\n");
-               ir3_print(ir);
-               list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-                       struct ir3_ra_block_data *bd = block->data;
-                       debug_printf("block%u:\n", block_id(block));
-                       print_bitset("  def", bd->def, ctx->alloc_count);
-                       print_bitset("  use", bd->use, ctx->alloc_count);
-                       print_bitset("  l/i", bd->livein, ctx->alloc_count);
-                       print_bitset("  l/o", bd->liveout, ctx->alloc_count);
-               }
-               list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
-                       debug_printf("array%u:\n", arr->id);
-                       debug_printf("  length:   %u\n", arr->length);
-                       debug_printf("  start_ip: %u\n", arr->start_ip);
-                       debug_printf("  end_ip:   %u\n", arr->end_ip);
-               }
-       }
-
-       /* extend start/end ranges based on livein/liveout info from cfg: */
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               struct ir3_ra_block_data *bd = block->data;
-
-               for (unsigned i = 0; i < ctx->alloc_count; i++) {
-                       if (BITSET_TEST(bd->livein, i)) {
-                               ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
-                               ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
-                       }
-
-                       if (BITSET_TEST(bd->liveout, i)) {
-                               ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
-                               ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
-                       }
-               }
-
-               list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-                       for (unsigned i = 0; i < arr->length; i++) {
-                               if (BITSET_TEST(bd->livein, i + arr->base)) {
-                                       arr->start_ip = MIN2(arr->start_ip, block->start_ip);
-                               }
-                               if (BITSET_TEST(bd->liveout, i + arr->base)) {
-                                       arr->end_ip = MAX2(arr->end_ip, block->end_ip);
-                               }
-                       }
-               }
-       }
-
-       /* need to fix things up to keep outputs live: */
-       for (unsigned i = 0; i < ir->noutputs; i++) {
-               struct ir3_instruction *instr = ir->outputs[i];
-               unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
-               ctx->use[name] = ctx->instr_cnt;
-       }
-
-       for (unsigned i = 0; i < ctx->alloc_count; i++) {
-               for (unsigned j = 0; j < ctx->alloc_count; j++) {
-                       if (intersects(ctx->def[i], ctx->use[i],
-                                       ctx->def[j], ctx->use[j])) {
-                               ra_add_node_interference(ctx->g, i, j);
-                       }
-               }
-       }
-}
-
-/* some instructions need fix-up if dst register is half precision: */
-static void fixup_half_instr_dst(struct ir3_instruction *instr)
-{
-       switch (opc_cat(instr->opc)) {
-       case 1: /* move instructions */
-               instr->cat1.dst_type = half_type(instr->cat1.dst_type);
-               break;
-       case 3:
-               switch (instr->opc) {
-               case OPC_MAD_F32:
-                       instr->opc = OPC_MAD_F16;
-                       break;
-               case OPC_SEL_B32:
-                       instr->opc = OPC_SEL_B16;
-                       break;
-               case OPC_SEL_S32:
-                       instr->opc = OPC_SEL_S16;
-                       break;
-               case OPC_SEL_F32:
-                       instr->opc = OPC_SEL_F16;
-                       break;
-               case OPC_SAD_S32:
-                       instr->opc = OPC_SAD_S16;
-                       break;
-               /* instructions may already be fixed up: */
-               case OPC_MAD_F16:
-               case OPC_SEL_B16:
-               case OPC_SEL_S16:
-               case OPC_SEL_F16:
-               case OPC_SAD_S16:
-                       break;
-               default:
-                       assert(0);
-                       break;
-               }
-               break;
-       case 5:
-               instr->cat5.type = half_type(instr->cat5.type);
-               break;
-       }
-}
-/* some instructions need fix-up if src register is half precision: */
-static void fixup_half_instr_src(struct ir3_instruction *instr)
-{
-       switch (instr->opc) {
-       case OPC_MOV:
-               instr->cat1.src_type = half_type(instr->cat1.src_type);
-               break;
-       default:
-               break;
-       }
-}
-
-/* NOTE: instr could be NULL in the IR3_REG_ARRAY case, for the first
- * array access(es), which do not have any previous access to depend
- * on from a scheduling point of view.
- */
-static void
-reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
-               struct ir3_instruction *instr)
-{
-       struct ir3_ra_instr_data *id;
-
-       if (reg->flags & IR3_REG_ARRAY) {
-               struct ir3_array *arr =
-                       ir3_lookup_array(ctx->ir, reg->array.id);
-               unsigned name = arr->base + reg->array.offset;
-               unsigned r = ra_get_node_reg(ctx->g, name);
-               unsigned num = ctx->set->ra_reg_to_gpr[r];
-
-               if (reg->flags & IR3_REG_RELATIV) {
-                       reg->array.offset = num;
-               } else {
-                       reg->num = num;
-                       reg->flags &= ~IR3_REG_SSA;
-               }
-
-               reg->flags &= ~IR3_REG_ARRAY;
-       } else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
-               unsigned name = ra_name(ctx, id);
-               unsigned r = ra_get_node_reg(ctx->g, name);
-               unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
-
-               debug_assert(!(reg->flags & IR3_REG_RELATIV));
-
-               if (is_high(id->defn))
-                       num += FIRST_HIGH_REG;
-
-               reg->num = num;
-               reg->flags &= ~IR3_REG_SSA;
-
-               if (is_half(id->defn))
-                       reg->flags |= IR3_REG_HALF;
-       }
-}
-
-static void
-ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-               struct ir3_register *reg;
-
-               if (instr->regs_count == 0)
-                       continue;
-
-               if (writes_gpr(instr)) {
-                       reg_assign(ctx, instr->regs[0], instr);
-                       if (instr->regs[0]->flags & IR3_REG_HALF)
-                               fixup_half_instr_dst(instr);
-               }
-
-               foreach_src_n(reg, n, instr) {
-                       struct ir3_instruction *src = reg->instr;
-                       /* Note: reg->instr could be null for IR3_REG_ARRAY */
-                       if (!(src || (reg->flags & IR3_REG_ARRAY)))
-                               continue;
-                       reg_assign(ctx, instr->regs[n+1], src);
-                       if (instr->regs[n+1]->flags & IR3_REG_HALF)
-                               fixup_half_instr_src(instr);
-               }
-       }
-}
-
-static int
-ra_alloc(struct ir3_ra_ctx *ctx)
-{
-       /* pre-assign array elements:
-        */
-       list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
-               unsigned base = 0;
-
-               if (arr->end_ip == 0)
-                       continue;
-
-               /* figure out what else we conflict with which has already
-                * been assigned:
-                */
-retry:
-               list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
-                       if (arr2 == arr)
-                               break;
-                       if (arr2->end_ip == 0)
-                               continue;
-                       /* if it intersects with liverange AND register range.. */
-                       if (intersects(arr->start_ip, arr->end_ip,
-                                       arr2->start_ip, arr2->end_ip) &&
-                               intersects(base, base + arr->length,
-                                       arr2->reg, arr2->reg + arr2->length)) {
-                               base = MAX2(base, arr2->reg + arr2->length);
-                               goto retry;
-                       }
-               }
-
-               arr->reg = base;
-
-               for (unsigned i = 0; i < arr->length; i++) {
-                       unsigned name, reg;
-
-                       name = arr->base + i;
-                       reg = ctx->set->gpr_to_ra_reg[0][base++];
-
-                       ra_set_node_reg(ctx->g, name, reg);
-               }
-       }
-
-       if (!ra_allocate(ctx->g))
-               return -1;
-
-       list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
-               ra_block_alloc(ctx, block);
-       }
-
-       return 0;
-}
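The retry loop above amounts to first-fit placement in two dimensions: an array is bumped past any earlier-placed array whose live range and register range both overlap it. A simplified standalone model of that loop (hypothetical struct; same half-open intersects() semantics as above):

    #include <stdbool.h>

    struct arr_model { unsigned start_ip, end_ip, reg, length; bool placed; };

    static bool overlaps(unsigned a0, unsigned a1, unsigned b0, unsigned b1)
    {
        return !((a0 >= b1) || (b0 >= a1));
    }

    /* place 'a' at the lowest base that avoids all already-placed arrays */
    static unsigned place(struct arr_model *arrs, unsigned n, struct arr_model *a)
    {
        unsigned base = 0;
    retry:
        for (unsigned i = 0; i < n; i++) {
            struct arr_model *o = &arrs[i];
            if (!o->placed || o == a)
                continue;
            if (overlaps(a->start_ip, a->end_ip, o->start_ip, o->end_ip) &&
                overlaps(base, base + a->length, o->reg, o->reg + o->length)) {
                base = o->reg + o->length;   /* bump past 'o' and rescan */
                goto retry;
            }
        }
        a->reg = base;
        a->placed = true;
        return base;
    }

Seeding ra_set_node_reg() with the resulting base, as ra_alloc() does above, pins those nodes before the coloring run.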
-
-int ir3_ra(struct ir3 *ir, gl_shader_stage type,
-               bool frag_coord, bool frag_face)
-{
-       struct ir3_ra_ctx ctx = {
-                       .ir = ir,
-                       .type = type,
-                       .frag_face = frag_face,
-                       .set = ir->compiler->set,
-       };
-       int ret;
-
-       ra_init(&ctx);
-       ra_add_interference(&ctx);
-       ret = ra_alloc(&ctx);
-       ra_destroy(&ctx);
-
-       return ret;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
deleted file mode 100644 (file)
index 6552980..0000000
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-
-#include "util/u_math.h"
-
-#include "ir3.h"
-
-/*
- * Instruction Scheduling:
- *
- * A recursive depth-based scheduling algo.  Recursively find an eligible
- * instruction to schedule from the deepest instruction (recursing through
- * its unscheduled src instructions).  Normally this would result in a
- * lot of re-traversal of the same instructions, so we cache results in
- * instr->data (and clear cached results that would no longer be valid
- * after scheduling an instruction).
- *
- * There are a few special cases that need to be handled, since sched
- * is currently independent of register allocation.  Usages of address
- * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
- * if you have two pairs of instructions that write the same special
- * register and then read it, then those pairs cannot be interleaved.
- * To solve this, when we are in such a scheduling "critical section",
- * and we encounter a conflicting write to a special register, we try
- * to schedule any remaining instructions that use that value first.
- */
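As a self-contained illustration of the caching described above (a minimal sketch with made-up names; the real logic lives in find_instr_recursive()/NULL_INSTR below): results of the depth-first search are memoized per node, with a sentinel distinguishing "nothing schedulable in this subtree" from "not yet visited".  As in the driver, stale entries must be cleared once an instruction is actually scheduled.

    #include <stdbool.h>
    #include <stddef.h>

    struct node {
        struct node *src[4];  /* SSA sources, NULL-terminated */
        struct node *cached;  /* memoized search result */
        bool scheduled;
    };

    #define NO_CANDIDATE ((void *)~0)  /* "nothing schedulable here" */

    static struct node *
    find_candidate(struct node *n)
    {
        if (n->scheduled)
            return NULL;
        if (n->cached)  /* reuse the result of an earlier traversal */
            return (n->cached == NO_CANDIDATE) ? NULL : n->cached;

        bool blocked = false;
        for (unsigned i = 0; (i < 4) && n->src[i]; i++) {
            if (n->src[i]->scheduled)
                continue;
            struct node *c = find_candidate(n->src[i]);
            if (c)
                return n->cached = c;
            blocked = true;  /* unscheduled src, but nothing ready below */
        }

        /* ready to schedule only if every src was already scheduled: */
        n->cached = blocked ? NO_CANDIDATE : n;
        return blocked ? NULL : n;
    }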
-
-struct ir3_sched_ctx {
-       struct ir3_block *block;           /* the current block */
-       struct list_head depth_list;       /* depth sorted unscheduled instrs */
-       struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
-	struct ir3_instruction *addr;      /* current a0.x writer, if any */
-	struct ir3_instruction *pred;      /* current p0.x writer, if any */
-       bool error;
-};
-
-static bool is_sfu_or_mem(struct ir3_instruction *instr)
-{
-       return is_sfu(instr) || is_mem(instr);
-}
-
-#define NULL_INSTR ((void *)~0)
-
-static void
-clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
-{
-       list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
-               if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
-                       instr2->data = NULL;
-       }
-}
-
-static void
-schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
-{
-       debug_assert(ctx->block == instr->block);
-
-       /* maybe there is a better way to handle this than just stuffing
-        * a nop.. ideally we'd know about this constraint in the
-        * scheduling and depth calculation..
-        */
-       if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-               ir3_NOP(ctx->block);
-
-       /* remove from depth list:
-        */
-       list_delinit(&instr->node);
-
-       if (writes_addr(instr)) {
-               debug_assert(ctx->addr == NULL);
-               ctx->addr = instr;
-       }
-
-       if (writes_pred(instr)) {
-               debug_assert(ctx->pred == NULL);
-               ctx->pred = instr;
-       }
-
-       instr->flags |= IR3_INSTR_MARK;
-
-       list_addtail(&instr->node, &instr->block->instr_list);
-       ctx->scheduled = instr;
-
-       if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
-               clear_cache(ctx, NULL);
-       } else {
-               /* invalidate only the necessary entries.. */
-               clear_cache(ctx, instr);
-       }
-}
-
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
-       struct ir3_instruction *d = NULL;
-       unsigned i = 0, id = 0;
-
-       while ((i < nsrcs) && !(d = srcs[id = i]))
-               i++;
-
-       if (!d)
-               return NULL;
-
-       for (; i < nsrcs; i++)
-               if (srcs[i] && (srcs[i]->depth > d->depth))
-                       d = srcs[id = i];
-
-       srcs[id] = NULL;
-
-       return d;
-}
-
-/**
- * @block: the block to search in, starting from end; in first pass,
- *    this will be the block the instruction would be inserted into
- *    (but has not yet been, ie. it only contains already scheduled
- *    instructions).  For intra-block scheduling (second pass), this
- *    would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd:  max distance, bail after searching this # of instruction
- *    slots, since it means the instruction we are looking for is
- *    far enough away
- * @pred:  if true, recursively search into predecessor blocks to
- *    find the worst case (shortest) distance (only possible after
- *    individual blocks are all scheduled)
- */
-static unsigned
-distance(struct ir3_block *block, struct ir3_instruction *instr,
-               unsigned maxd, bool pred)
-{
-       unsigned d = 0;
-
-       list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
-               if ((n == instr) || (d >= maxd))
-                       return d;
-               /* NOTE: don't count branch/jump since we don't know yet if they will
-                * be eliminated later in resolve_jumps().. really should do that
-                * earlier so we don't have this constraint.
-                */
-               if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
-                       d++;
-       }
-
-       /* if coming from a predecessor block, assume it is assigned far
-        * enough away.. we'll fix up later.
-        */
-       if (!pred)
-               return maxd;
-
-       if (pred && (block->data != block)) {
-               /* Search into predecessor blocks, finding the one with the
-                * shortest distance, since that will be the worst case
-                */
-               unsigned min = maxd - d;
-
-               /* (ab)use block->data to prevent recursion: */
-               block->data = block;
-
-               for (unsigned i = 0; i < block->predecessors_count; i++) {
-                       unsigned n;
-
-                       n = distance(block->predecessors[i], instr, min, pred);
-
-                       min = MIN2(min, n);
-               }
-
-               block->data = NULL;
-               d += min;
-       }
-
-       return d;
-}
-
-/* calculate delay for specified src: */
-static unsigned
-delay_calc_srcn(struct ir3_block *block,
-               struct ir3_instruction *assigner,
-               struct ir3_instruction *consumer,
-               unsigned srcn, bool soft, bool pred)
-{
-       unsigned delay = 0;
-
-       if (is_meta(assigner)) {
-               struct ir3_instruction *src;
-               foreach_ssa_src(src, assigner) {
-                       unsigned d;
-                       d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
-                       delay = MAX2(delay, d);
-               }
-       } else {
-               if (soft) {
-                       if (is_sfu(assigner)) {
-                               delay = 4;
-                       } else {
-                               delay = ir3_delayslots(assigner, consumer, srcn);
-                       }
-               } else {
-                       delay = ir3_delayslots(assigner, consumer, srcn);
-               }
-               delay -= distance(block, assigner, delay, pred);
-       }
-
-       return delay;
-}
-
-/* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned
-delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-               bool soft, bool pred)
-{
-       unsigned delay = 0;
-       struct ir3_instruction *src;
-
-       foreach_ssa_src_n(src, i, instr) {
-               unsigned d;
-               d = delay_calc_srcn(block, src, instr, i, soft, pred);
-               delay = MAX2(delay, d);
-       }
-
-       return delay;
-}
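To make the arithmetic above concrete (numbers assumed for illustration): under "soft" scheduling an sfu producer is treated as a 4-cycle latency, so if two eligible instructions were already placed between producer and consumer, distance() returns 2 and delay_calc() reports 4 - 2 = 2, ie. two more slots still need to be filled (by other instructions or nops) before the consumer can legally issue.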
-
-struct ir3_sched_notes {
-       /* there is at least one kill which could be scheduled, except
-        * for unscheduled bary.f's:
-        */
-       bool blocked_kill;
-       /* there is at least one instruction that could be scheduled,
-        * except for conflicting address/predicate register usage:
-        */
-       bool addr_conflict, pred_conflict;
-};
-
-static bool is_scheduled(struct ir3_instruction *instr)
-{
-       return !!(instr->flags & IR3_INSTR_MARK);
-}
-
-/* could an instruction be scheduled if specified ssa src was scheduled? */
-static bool
-could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
-{
-       struct ir3_instruction *other_src;
-       foreach_ssa_src(other_src, instr) {
-               /* if dependency not scheduled, we aren't ready yet: */
-               if ((src != other_src) && !is_scheduled(other_src)) {
-                       return false;
-               }
-       }
-       return true;
-}
-
-/* Check if instruction is ok to schedule.  Make sure it is not blocked
- * by use of addr/predicate register, etc.
- */
-static bool
-check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-               struct ir3_instruction *instr)
-{
-	/* For instructions that write the address register, we need to
-	 * make sure there is at least one otherwise-ready instruction
-	 * that uses the addr value.
-	 *
-	 * TODO if any instructions use the pred register and have other
-	 * src args, we would need to do the same for writes_pred()..
-        */
-       if (writes_addr(instr)) {
-               struct ir3 *ir = instr->block->shader;
-               bool ready = false;
-               for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
-                       struct ir3_instruction *indirect = ir->indirects[i];
-                       if (!indirect)
-                               continue;
-                       if (indirect->address != instr)
-                               continue;
-                       ready = could_sched(indirect, instr);
-               }
-
-               /* nothing could be scheduled, so keep looking: */
-               if (!ready)
-                       return false;
-       }
-
-       /* if this is a write to address/predicate register, and that
-        * register is currently in use, we need to defer until it is
-        * free:
-        */
-       if (writes_addr(instr) && ctx->addr) {
-               debug_assert(ctx->addr != instr);
-               notes->addr_conflict = true;
-               return false;
-       }
-
-       if (writes_pred(instr) && ctx->pred) {
-               debug_assert(ctx->pred != instr);
-               notes->pred_conflict = true;
-               return false;
-       }
-
-       /* if the instruction is a kill, we need to ensure *every*
-        * bary.f is scheduled.  The hw seems unhappy if the thread
-        * gets killed before the end-input (ei) flag is hit.
-        *
-        * We could do this by adding each bary.f instruction as
-        * virtual ssa src for the kill instruction.  But we have
-        * fixed length instr->regs[].
-        *
-        * TODO this wouldn't be quite right if we had multiple
-        * basic blocks, if any block was conditional.  We'd need
-        * to schedule the bary.f's outside of any block which
-        * was conditional that contained a kill.. I think..
-        */
-       if (is_kill(instr)) {
-               struct ir3 *ir = instr->block->shader;
-
-               for (unsigned i = 0; i < ir->baryfs_count; i++) {
-                       struct ir3_instruction *baryf = ir->baryfs[i];
-                       if (baryf->flags & IR3_INSTR_UNUSED)
-                               continue;
-                       if (!is_scheduled(baryf)) {
-                               notes->blocked_kill = true;
-                               return false;
-                       }
-               }
-       }
-
-       return true;
-}
-
-/* Find the best instruction to schedule from the specified instruction
- * or, recursively, its ssa sources.
- */
-static struct ir3_instruction *
-find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-               struct ir3_instruction *instr)
-{
-       struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
-       struct ir3_instruction *src;
-       unsigned nsrcs = 0;
-
-       if (is_scheduled(instr))
-               return NULL;
-
-       /* use instr->data to cache the results of recursing up the
-        * instr src's.  Otherwise the recursive algo can scale quite
-        * badly w/ shader size.  But this takes some care to clear
-        * the cache appropriately when instructions are scheduled.
-        */
-       if (instr->data) {
-               if (instr->data == NULL_INSTR)
-                       return NULL;
-               return instr->data;
-       }
-
-       /* find unscheduled srcs: */
-       foreach_ssa_src(src, instr) {
-               if (!is_scheduled(src)) {
-                       debug_assert(nsrcs < ARRAY_SIZE(srcs));
-                       srcs[nsrcs++] = src;
-               }
-       }
-
-       /* if all our src's are already scheduled: */
-       if (nsrcs == 0) {
-               if (check_instr(ctx, notes, instr)) {
-                       instr->data = instr;
-                       return instr;
-               }
-               return NULL;
-       }
-
-       while ((src = deepest(srcs, nsrcs))) {
-               struct ir3_instruction *candidate;
-
-               candidate = find_instr_recursive(ctx, notes, src);
-               if (!candidate)
-                       continue;
-
-               if (check_instr(ctx, notes, candidate)) {
-                       instr->data = candidate;
-                       return candidate;
-               }
-       }
-
-       instr->data = NULL_INSTR;
-       return NULL;
-}
-
-/* find instruction to schedule: */
-static struct ir3_instruction *
-find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
-               bool soft)
-{
-       struct ir3_instruction *best_instr = NULL;
-       unsigned min_delay = ~0;
-
-       /* TODO we'd really rather use the list/array of block outputs.  But we
-        * don't have such a thing.  Recursing *every* instruction in the list
-        * will result in a lot of repeated traversal, since instructions will
-        * get traversed both when they appear as ssa src to a later instruction
-        * as well as where they appear in the depth_list.
-        */
-       list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
-               struct ir3_instruction *candidate;
-               unsigned delay;
-
-               candidate = find_instr_recursive(ctx, notes, instr);
-               if (!candidate)
-                       continue;
-
-               delay = delay_calc(ctx->block, candidate, soft, false);
-               if (delay < min_delay) {
-                       best_instr = candidate;
-                       min_delay = delay;
-               }
-
-               if (min_delay == 0)
-                       break;
-       }
-
-       return best_instr;
-}
-
-/* "spill" the address register by remapping any unscheduled
- * instructions which depend on the current address register
- * to a clone of the instruction which wrote the address reg.
- */
-static struct ir3_instruction *
-split_addr(struct ir3_sched_ctx *ctx)
-{
-       struct ir3 *ir;
-       struct ir3_instruction *new_addr = NULL;
-       unsigned i;
-
-       debug_assert(ctx->addr);
-
-       ir = ctx->addr->block->shader;
-
-       for (i = 0; i < ir->indirects_count; i++) {
-               struct ir3_instruction *indirect = ir->indirects[i];
-
-               if (!indirect)
-                       continue;
-
-               /* skip instructions already scheduled: */
-               if (is_scheduled(indirect))
-                       continue;
-
-               /* remap remaining instructions using current addr
-                * to new addr:
-                */
-               if (indirect->address == ctx->addr) {
-                       if (!new_addr) {
-                               new_addr = ir3_instr_clone(ctx->addr);
-                               /* original addr is scheduled, but new one isn't: */
-                               new_addr->flags &= ~IR3_INSTR_MARK;
-                       }
-                       ir3_instr_set_address(indirect, new_addr);
-               }
-       }
-
-       /* all remaining indirects remapped to new addr: */
-       ctx->addr = NULL;
-
-       return new_addr;
-}
-
-/* "spill" the predicate register by remapping any unscheduled
- * instructions which depend on the current predicate register
- * to a clone of the instruction which wrote the predicate reg.
- */
-static struct ir3_instruction *
-split_pred(struct ir3_sched_ctx *ctx)
-{
-       struct ir3 *ir;
-       struct ir3_instruction *new_pred = NULL;
-       unsigned i;
-
-       debug_assert(ctx->pred);
-
-       ir = ctx->pred->block->shader;
-
-       for (i = 0; i < ir->predicates_count; i++) {
-               struct ir3_instruction *predicated = ir->predicates[i];
-
-               /* skip instructions already scheduled: */
-               if (is_scheduled(predicated))
-                       continue;
-
-               /* remap remaining instructions using current pred
-                * to new pred:
-                *
-                * TODO is there ever a case when pred isn't first
-                * (and only) src?
-                */
-               if (ssa(predicated->regs[1]) == ctx->pred) {
-                       if (!new_pred) {
-                               new_pred = ir3_instr_clone(ctx->pred);
-                               /* original pred is scheduled, but new one isn't: */
-                               new_pred->flags &= ~IR3_INSTR_MARK;
-                       }
-                       predicated->regs[1]->instr = new_pred;
-               }
-       }
-
-       /* all remaining predicated remapped to new pred: */
-       ctx->pred = NULL;
-
-       return new_pred;
-}
-
-static void
-sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
-       struct list_head unscheduled_list;
-
-       ctx->block = block;
-
-       /* addr/pred writes are per-block: */
-       ctx->addr = NULL;
-       ctx->pred = NULL;
-
-       /* move all instructions to the unscheduled list, and
-        * empty the block's instruction list (to which we will
-        * be inserting).
-        */
-       list_replace(&block->instr_list, &unscheduled_list);
-       list_inithead(&block->instr_list);
-       list_inithead(&ctx->depth_list);
-
-       /* first a pre-pass to schedule all meta:input instructions
-        * (which need to appear first so that RA knows the register is
-        * occupied), and move remaining to depth sorted list:
-	 * occupied), and move the remaining to the depth-sorted list:
-       list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
-               if (instr->opc == OPC_META_INPUT) {
-                       schedule(ctx, instr);
-               } else {
-                       ir3_insert_by_depth(instr, &ctx->depth_list);
-               }
-       }
-
-       while (!list_empty(&ctx->depth_list)) {
-               struct ir3_sched_notes notes = {0};
-               struct ir3_instruction *instr;
-
-               instr = find_eligible_instr(ctx, &notes, true);
-               if (!instr)
-                       instr = find_eligible_instr(ctx, &notes, false);
-
-               if (instr) {
-                       unsigned delay = delay_calc(ctx->block, instr, false, false);
-
-                       /* and if we run out of instructions that can be scheduled,
-                        * then it is time for nop's:
-                        */
-                       debug_assert(delay <= 6);
-                       while (delay > 0) {
-                               ir3_NOP(block);
-                               delay--;
-                       }
-
-                       schedule(ctx, instr);
-               } else {
-                       struct ir3_instruction *new_instr = NULL;
-
-                       /* nothing available to schedule.. if we are blocked on
-                        * address/predicate register conflict, then break the
-                        * deadlock by cloning the instruction that wrote that
-                        * reg:
-                        */
-                       if (notes.addr_conflict) {
-                               new_instr = split_addr(ctx);
-                       } else if (notes.pred_conflict) {
-                               new_instr = split_pred(ctx);
-                       } else {
-                               debug_assert(0);
-                               ctx->error = true;
-                               return;
-                       }
-
-                       if (new_instr) {
-                               /* clearing current addr/pred can change what is
-                                * available to schedule, so clear cache..
-                                */
-                               clear_cache(ctx, NULL);
-
-                               ir3_insert_by_depth(new_instr, &ctx->depth_list);
-                               /* the original instr that wrote addr/pred may have
-                                * originated from a different block:
-                                */
-                               new_instr->block = block;
-                       }
-               }
-       }
-
-       /* And lastly, insert branch/jump instructions to take us to
-        * the next block.  Later we'll strip back out the branches
-        * that simply jump to next instruction.
-        */
-       if (block->successors[1]) {
-               /* if/else, conditional branches to "then" or "else": */
-               struct ir3_instruction *br;
-               unsigned delay = 6;
-
-               debug_assert(ctx->pred);
-               debug_assert(block->condition);
-
-               delay -= distance(ctx->block, ctx->pred, delay, false);
-
-               while (delay > 0) {
-                       ir3_NOP(block);
-                       delay--;
-               }
-
-               /* create "else" branch first (since "then" block should
-                * frequently/always end up being a fall-thru):
-                */
-               br = ir3_BR(block);
-               br->cat0.inv = true;
-               br->cat0.target = block->successors[1];
-
-               /* NOTE: we have to hard code delay of 6 above, since
-                * we want to insert the nop's before constructing the
-                * branch.  Throw in an assert so we notice if this
-                * ever breaks on future generation:
-		 * ever breaks on a future generation:
-               debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
-
-               br = ir3_BR(block);
-               br->cat0.target = block->successors[0];
-
-       } else if (block->successors[0]) {
-               /* otherwise unconditional jump to next block: */
-               struct ir3_instruction *jmp;
-
-               jmp = ir3_JUMP(block);
-               jmp->cat0.target = block->successors[0];
-       }
-
-       /* NOTE: if we kept track of the predecessors, we could do a better
-	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
-	 * Note that as we eliminate blocks which contain only an unconditional
-	 * jump, we probably need to propagate the (jp) flag..
-        */
-}
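A worked example of the branch epilogue above (counts assumed): the predicate-to-branch latency is hard-coded at 6 slots, so if two counted instructions already separate the p0.x write from the end of the block, distance() returns 2 and 6 - 2 = 4 nops are emitted before the inverted "else" branch and the fall-through "then" branch are appended.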
-
-/* After scheduling individual blocks, we could still have cases where,
- * on one (or more) paths into a block, a value produced by a previous
- * block has too few delay slots to be legal.  We can't deal with this
- * in the first pass because of loops (ie. we can't ensure all
- * predecessor blocks are already scheduled).  All we can really do at
- * this point is stuff in extra nop's until things are legal.
- */
-static void
-sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
-{
-       unsigned n = 0;
-
-       ctx->block = block;
-
-       list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
-               unsigned delay = 0;
-
-               for (unsigned i = 0; i < block->predecessors_count; i++) {
-                       unsigned d = delay_calc(block->predecessors[i], instr, false, true);
-                       delay = MAX2(d, delay);
-               }
-
-               while (delay > n) {
-                       struct ir3_instruction *nop = ir3_NOP(block);
-
-                       /* move to before instr: */
-                       list_delinit(&nop->node);
-                       list_addtail(&nop->node, &instr->node);
-
-                       n++;
-               }
-
-               /* we can bail once we hit worst case delay: */
-               if (++n > 6)
-                       break;
-       }
-}
-
-int ir3_sched(struct ir3 *ir)
-{
-       struct ir3_sched_ctx ctx = {0};
-
-       ir3_clear_mark(ir);
-
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               sched_block(&ctx, block);
-       }
-
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               sched_intra_block(&ctx, block);
-       }
-
-       if (ctx.error)
-               return -1;
-       return 0;
-}
-
-/* does instruction 'prior' need to be scheduled before 'instr'? */
-static bool
-depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
-{
-       /* TODO for dependencies that are related to a specific object, ie
-        * a specific SSBO/image/array, we could relax this constraint to
-        * make accesses to unrelated objects not depend on each other (at
-        * least as long as not declared coherent)
-        */
-       if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
-                       ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
-               return true;
-       return !!(instr->barrier_class & prior->barrier_conflict);
-}
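The barrier_class/barrier_conflict pairing is easiest to see in a small standalone sketch (the bit values and struct are hypothetical, and the IR3_BARRIER_EVERYTHING special case is omitted): a read's class intersects a prior write's conflict mask, so that pair must stay ordered, while two reads don't order against each other.

    #include <stdbool.h>

    enum { SSBO_R = 1 << 0, SSBO_W = 1 << 1 };

    struct insn {
        unsigned barrier_class;     /* what this instruction is */
        unsigned barrier_conflict;  /* what it must stay ordered against */
    };

    static bool
    must_order(const struct insn *instr, const struct insn *prior)
    {
        return !!(instr->barrier_class & prior->barrier_conflict);
    }

    static void
    example(void)
    {
        struct insn write = { SSBO_W, SSBO_R | SSBO_W };  /* vs. both */
        struct insn read  = { SSBO_R, SSBO_W };           /* vs. writes */

        bool raw = must_order(&read, &write);  /* true: read-after-write */
        bool rar = must_order(&read, &read);   /* false: reads may reorder */
        (void)raw; (void)rar;
    }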
-
-static void
-add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
-{
-       struct list_head *prev = instr->node.prev;
-       struct list_head *next = instr->node.next;
-
-       /* add dependencies on previous instructions that must be scheduled
-        * prior to the current instruction
-        */
-       while (prev != &block->instr_list) {
-               struct ir3_instruction *pi =
-                       LIST_ENTRY(struct ir3_instruction, prev, node);
-
-               prev = prev->prev;
-
-               if (is_meta(pi))
-                       continue;
-
-               if (instr->barrier_class == pi->barrier_class) {
-                       ir3_instr_add_dep(instr, pi);
-                       break;
-               }
-
-               if (depends_on(instr, pi))
-                       ir3_instr_add_dep(instr, pi);
-       }
-
-       /* add dependencies on this instruction to following instructions
-        * that must be scheduled after the current instruction:
-        */
-       while (next != &block->instr_list) {
-               struct ir3_instruction *ni =
-                       LIST_ENTRY(struct ir3_instruction, next, node);
-
-               next = next->next;
-
-               if (is_meta(ni))
-                       continue;
-
-               if (instr->barrier_class == ni->barrier_class) {
-                       ir3_instr_add_dep(ni, instr);
-                       break;
-               }
-
-               if (depends_on(ni, instr))
-                       ir3_instr_add_dep(ni, instr);
-       }
-}
-
-/* before scheduling a block, we need to add any necessary false-dependencies
- * to ensure that:
- *
- *  (1) barriers are scheduled in the right order wrt instructions related
- *      to the barrier
- *
- *  (2) reads that come before a write actually get scheduled before the
- *      write
- */
-static void
-calculate_deps(struct ir3_block *block)
-{
-       list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
-               if (instr->barrier_class) {
-                       add_barrier_deps(block, instr);
-               }
-       }
-}
-
-void
-ir3_sched_add_deps(struct ir3 *ir)
-{
-       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
-               calculate_deps(block);
-       }
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
deleted file mode 100644 (file)
index b58a204..0000000
+++ /dev/null
@@ -1,436 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_format.h"
-
-#include "freedreno_util.h"
-
-#include "ir3_shader.h"
-#include "ir3_compiler.h"
-#include "ir3_nir.h"
-
-int
-ir3_glsl_type_size(const struct glsl_type *type)
-{
-       return glsl_count_attribute_slots(type, false);
-}
-
-static void
-delete_variant(struct ir3_shader_variant *v)
-{
-       if (v->ir)
-               ir3_destroy(v->ir);
-       if (v->bo)
-               fd_bo_del(v->bo);
-       if (v->immediates)
-               free(v->immediates);
-       free(v);
-}
-
-/* For vertex shaders, the inputs are loaded into registers before the shader
- * is executed, so max_regs from the shader instructions might not properly
- * reflect the # of registers actually used, especially in the case of
- * passthrough varyings.
- *
- * Likewise, for fragment shaders, we can have some regs which are passed
- * input values but never touched by the resulting shader (ie. as a result
- * of dead code elimination, or simply because we don't know how to turn
- * the reg off).
- */
-static void
-fixup_regfootprint(struct ir3_shader_variant *v)
-{
-       unsigned i;
-
-       for (i = 0; i < v->inputs_count; i++) {
-		/* skip frag inputs fetched via bary.f since their reg's are
-                * not written by gpu before shader starts (and in fact the
-                * regid's might not even be valid)
-                */
-               if (v->inputs[i].bary)
-                       continue;
-
-               /* ignore high regs that are global to all threads in a warp
-                * (they exist by default) (a5xx+)
-                */
-               if (v->inputs[i].regid >= regid(48,0))
-                       continue;
-
-               if (v->inputs[i].compmask) {
-                       unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
-                       int32_t regid = (v->inputs[i].regid + n) >> 2;
-                       v->info.max_reg = MAX2(v->info.max_reg, regid);
-               }
-       }
-
-       for (i = 0; i < v->outputs_count; i++) {
-               int32_t regid = (v->outputs[i].regid + 3) >> 2;
-               v->info.max_reg = MAX2(v->info.max_reg, regid);
-       }
-}
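Worked example of the footprint math (values assumed): a regid packs register and component as (reg << 2) | comp, so an input at regid 8 is r2.x.  With compmask 0xf, util_last_bit() returns 4, so n = 3 and (8 + 3) >> 2 = 2, bumping max_reg to at least 2, ie. r2 is the highest full register the input touches.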
-
-/* wrapper for ir3_assemble() which does some info fixup based on
- * shader state.  Non-static since used by ir3_cmdline too.
- */
-void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
-{
-       void *bin;
-
-       bin = ir3_assemble(v->ir, &v->info, gpu_id);
-       if (!bin)
-               return NULL;
-
-       if (gpu_id >= 400) {
-               v->instrlen = v->info.sizedwords / (2 * 16);
-       } else {
-               v->instrlen = v->info.sizedwords / (2 * 4);
-       }
-
-       /* NOTE: if relative addressing is used, we set constlen in
-        * the compiler (to worst-case value) since we don't know in
-        * the assembler what the max addr reg value can be:
-        */
-       v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));
-
-       fixup_regfootprint(v);
-
-       return bin;
-}
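Concretely (sizes assumed): each instruction is 2 dwords and an instruction group is 4 instructions on a3xx vs 16 on a4xx and later, so a 64-instruction shader has sizedwords = 128 and gets instrlen = 128 / (2 * 16) = 4 groups on a4xx+, but instrlen = 128 / (2 * 4) = 16 groups on a3xx.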
-
-static void
-assemble_variant(struct ir3_shader_variant *v)
-{
-       struct ir3_compiler *compiler = v->shader->compiler;
-       uint32_t gpu_id = compiler->gpu_id;
-       uint32_t sz, *bin;
-
-       bin = ir3_shader_assemble(v, gpu_id);
-       sz = v->info.sizedwords * 4;
-
-       v->bo = fd_bo_new(compiler->dev, sz,
-                       DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
-                       DRM_FREEDRENO_GEM_TYPE_KMEM);
-
-       memcpy(fd_bo_map(v->bo), bin, sz);
-
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               struct ir3_shader_key key = v->key;
-               printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
-                       v->binning_pass, key.color_two_side, key.half_precision);
-               ir3_shader_disasm(v, bin, stdout);
-       }
-
-       if (shader_debug_enabled(v->shader->type)) {
-               fprintf(stderr, "Native code for unnamed %s shader %s:\n",
-                       _mesa_shader_stage_to_string(v->shader->type),
-                       v->shader->nir->info.name);
-               if (v->shader->type == MESA_SHADER_FRAGMENT)
-                       fprintf(stderr, "SIMD0\n");
-               ir3_shader_disasm(v, bin, stderr);
-       }
-
-       free(bin);
-
-       /* no need to keep the ir around beyond this point: */
-       ir3_destroy(v->ir);
-       v->ir = NULL;
-}
-
-static struct ir3_shader_variant *
-create_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
-               bool binning_pass)
-{
-       struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
-       int ret;
-
-       if (!v)
-               return NULL;
-
-       v->id = ++shader->variant_count;
-       v->shader = shader;
-       v->binning_pass = binning_pass;
-       v->key = *key;
-       v->type = shader->type;
-
-       ret = ir3_compile_shader_nir(shader->compiler, v);
-       if (ret) {
-               debug_error("compile failed!");
-               goto fail;
-       }
-
-       assemble_variant(v);
-       if (!v->bo) {
-               debug_error("assemble failed!");
-               goto fail;
-       }
-
-       return v;
-
-fail:
-       delete_variant(v);
-       return NULL;
-}
-
-static inline struct ir3_shader_variant *
-shader_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
-               bool *created)
-{
-       struct ir3_shader_variant *v;
-
-       *created = false;
-
-       for (v = shader->variants; v; v = v->next)
-               if (ir3_shader_key_equal(key, &v->key))
-                       return v;
-
-       /* compile new variant if it doesn't exist already: */
-       v = create_variant(shader, key, false);
-       if (v) {
-               v->next = shader->variants;
-               shader->variants = v;
-               *created = true;
-       }
-
-       return v;
-}
-
-struct ir3_shader_variant *
-ir3_shader_get_variant(struct ir3_shader *shader, struct ir3_shader_key *key,
-               bool binning_pass, bool *created)
-{
-       struct ir3_shader_variant *v =
-                       shader_variant(shader, key, created);
-
-       if (binning_pass) {
-               if (!v->binning)
-                       v->binning = create_variant(shader, key, true);
-               return v->binning;
-       }
-
-       return v;
-}
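A usage sketch (the emit_shader() wrapper is hypothetical; only ir3_shader_get_variant() and its created out-param come from this file): state-emit code looks up, and lazily compiles, the variant for the current key, and can use created to react to a first-time compile:

    static void
    emit_shader(struct ir3_shader *shader, struct ir3_shader_key *key,
            bool binning)
    {
        bool created = false;
        struct ir3_shader_variant *v =
            ir3_shader_get_variant(shader, key, binning, &created);

        if (!v)
            return;  /* compile or assemble failed */

        if (created) {
            /* first variant compiled for this key; a driver could
             * log it or bump shader-cache stats here
             */
        }

        /* ... program v->bo / v->instrlen / v->constlen into hw ... */
    }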
-
-void
-ir3_shader_destroy(struct ir3_shader *shader)
-{
-       struct ir3_shader_variant *v, *t;
-       for (v = shader->variants; v; ) {
-               t = v;
-               v = v->next;
-               delete_variant(t);
-       }
-       ralloc_free(shader->nir);
-       free(shader);
-}
-
-struct ir3_shader *
-ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir)
-{
-       struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
-
-       shader->compiler = compiler;
-       shader->id = ++shader->compiler->shader_count;
-       shader->type = nir->info.stage;
-
-       NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
-                          (nir_lower_io_options)0);
-
-       /* do first pass optimization, ignoring the key: */
-       shader->nir = ir3_optimize_nir(shader, nir, NULL);
-       if (ir3_shader_debug & IR3_DBG_DISASM) {
-               printf("dump nir%d: type=%d", shader->id, shader->type);
-               nir_print_shader(shader->nir, stdout);
-       }
-
-       return shader;
-}
-
-static void dump_reg(FILE *out, const char *name, uint32_t r)
-{
-       if (r != regid(63,0))
-               fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
-}
-
-static void dump_output(FILE *out, struct ir3_shader_variant *so,
-               unsigned slot, const char *name)
-{
-       uint32_t regid;
-       regid = ir3_find_output_regid(so, slot);
-       dump_reg(out, name, regid);
-}
-
-void
-ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
-{
-       struct ir3 *ir = so->ir;
-       struct ir3_register *reg;
-       const char *type = ir3_shader_stage(so->shader);
-       uint8_t regid;
-       unsigned i;
-
-       for (i = 0; i < ir->ninputs; i++) {
-               if (!ir->inputs[i]) {
-                       fprintf(out, "; in%d unused\n", i);
-                       continue;
-               }
-               reg = ir->inputs[i]->regs[0];
-               regid = reg->num;
-               fprintf(out, "@in(%sr%d.%c)\tin%d\n",
-                               (reg->flags & IR3_REG_HALF) ? "h" : "",
-                               (regid >> 2), "xyzw"[regid & 0x3], i);
-       }
-
-       for (i = 0; i < ir->noutputs; i++) {
-               if (!ir->outputs[i]) {
-                       fprintf(out, "; out%d unused\n", i);
-                       continue;
-               }
-               /* kill shows up as a virtual output.. skip it! */
-               if (is_kill(ir->outputs[i]))
-                       continue;
-               reg = ir->outputs[i]->regs[0];
-               regid = reg->num;
-               fprintf(out, "@out(%sr%d.%c)\tout%d\n",
-                               (reg->flags & IR3_REG_HALF) ? "h" : "",
-                               (regid >> 2), "xyzw"[regid & 0x3], i);
-       }
-
-       for (i = 0; i < so->immediates_count; i++) {
-               fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i);
-               fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
-                               so->immediates[i].val[0],
-                               so->immediates[i].val[1],
-                               so->immediates[i].val[2],
-                               so->immediates[i].val[3]);
-       }
-
-       disasm_a3xx(bin, so->info.sizedwords, 0, out);
-
-       switch (so->type) {
-       case MESA_SHADER_VERTEX:
-               fprintf(out, "; %s: outputs:", type);
-               for (i = 0; i < so->outputs_count; i++) {
-                       uint8_t regid = so->outputs[i].regid;
-                       fprintf(out, " r%d.%c (%s)",
-                                       (regid >> 2), "xyzw"[regid & 0x3],
-                                       gl_varying_slot_name(so->outputs[i].slot));
-               }
-               fprintf(out, "\n");
-               fprintf(out, "; %s: inputs:", type);
-               for (i = 0; i < so->inputs_count; i++) {
-                       uint8_t regid = so->inputs[i].regid;
-                       fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)",
-                                       (regid >> 2), "xyzw"[regid & 0x3],
-                                       so->inputs[i].compmask,
-                                       so->inputs[i].inloc,
-                                       so->inputs[i].bary);
-               }
-               fprintf(out, "\n");
-               break;
-       case MESA_SHADER_FRAGMENT:
-               fprintf(out, "; %s: outputs:", type);
-               for (i = 0; i < so->outputs_count; i++) {
-                       uint8_t regid = so->outputs[i].regid;
-                       fprintf(out, " r%d.%c (%s)",
-                                       (regid >> 2), "xyzw"[regid & 0x3],
-                                       gl_frag_result_name(so->outputs[i].slot));
-               }
-               fprintf(out, "\n");
-               fprintf(out, "; %s: inputs:", type);
-               for (i = 0; i < so->inputs_count; i++) {
-                       uint8_t regid = so->inputs[i].regid;
-                       fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)",
-                                       (regid >> 2), "xyzw"[regid & 0x3],
-                                       gl_varying_slot_name(so->inputs[i].slot),
-                                       so->inputs[i].compmask,
-                                       so->inputs[i].inloc,
-                                       so->inputs[i].bary);
-               }
-               fprintf(out, "\n");
-               break;
-       default:
-               /* TODO */
-               break;
-       }
-
-       /* print generic shader info: */
-       fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n",
-                       type, so->shader->id, so->id,
-                       so->info.instrs_count,
-                       so->info.max_half_reg + 1,
-                       so->info.max_reg + 1);
-
-       fprintf(out, "; %d const, %u constlen\n",
-                       so->info.max_const + 1,
-                       so->constlen);
-
-       fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);
-
-       /* print shader type specific info: */
-       switch (so->type) {
-       case MESA_SHADER_VERTEX:
-               dump_output(out, so, VARYING_SLOT_POS, "pos");
-               dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
-               break;
-       case MESA_SHADER_FRAGMENT:
-               dump_reg(out, "pos (bary)",
-                       ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD));
-               dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
-               if (so->color0_mrt) {
-                       dump_output(out, so, FRAG_RESULT_COLOR, "color");
-               } else {
-                       dump_output(out, so, FRAG_RESULT_DATA0, "data0");
-                       dump_output(out, so, FRAG_RESULT_DATA1, "data1");
-                       dump_output(out, so, FRAG_RESULT_DATA2, "data2");
-                       dump_output(out, so, FRAG_RESULT_DATA3, "data3");
-                       dump_output(out, so, FRAG_RESULT_DATA4, "data4");
-                       dump_output(out, so, FRAG_RESULT_DATA5, "data5");
-                       dump_output(out, so, FRAG_RESULT_DATA6, "data6");
-                       dump_output(out, so, FRAG_RESULT_DATA7, "data7");
-               }
-               /* these two are hard-coded since we don't know how to
-                * program them to anything but all 0's...
-                */
-               if (so->frag_coord)
-                       fprintf(out, "; fragcoord: r0.x\n");
-               if (so->frag_face)
-                       fprintf(out, "; fragface: hr0.x\n");
-               break;
-       default:
-               /* TODO */
-               break;
-       }
-
-       fprintf(out, "\n");
-}
-
-uint64_t
-ir3_shader_outputs(const struct ir3_shader *so)
-{
-       return so->nir->info.outputs_written;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
deleted file mode 100644 (file)
index bc47160..0000000
+++ /dev/null
@@ -1,587 +0,0 @@
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#ifndef IR3_SHADER_H_
-#define IR3_SHADER_H_
-
-#include <stdio.h>
-
-#include "compiler/shader_enums.h"
-#include "compiler/nir/nir.h"
-#include "util/bitscan.h"
-
-#include "ir3.h"
-
-struct glsl_type;
-
-/* driver param indices: */
-enum ir3_driver_param {
-       /* compute shader driver params: */
-       IR3_DP_NUM_WORK_GROUPS_X = 0,
-       IR3_DP_NUM_WORK_GROUPS_Y = 1,
-       IR3_DP_NUM_WORK_GROUPS_Z = 2,
-       IR3_DP_LOCAL_GROUP_SIZE_X = 4,
-       IR3_DP_LOCAL_GROUP_SIZE_Y = 5,
-       IR3_DP_LOCAL_GROUP_SIZE_Z = 6,
-       /* NOTE: gl_NumWorkGroups should be vec4 aligned because
-        * glDispatchComputeIndirect() needs to load these from
-        * the info->indirect buffer.  Keep that in mind when/if
-	 * adding any additional CS driver params.
-        */
-       IR3_DP_CS_COUNT   = 8,   /* must be aligned to vec4 */
-
-       /* vertex shader driver params: */
-       IR3_DP_VTXID_BASE = 0,
-       IR3_DP_VTXCNT_MAX = 1,
-       /* user-clip-plane components, up to 8x vec4's: */
-       IR3_DP_UCP0_X     = 4,
-       /* .... */
-       IR3_DP_UCP7_W     = 35,
-       IR3_DP_VS_COUNT   = 36   /* must be aligned to vec4 */
-};
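The UCP indices above fall out of vec4 packing: user clip plane i's component j lives at driver param IR3_DP_UCP0_X + i*4 + j, so plane 7's .w is 4 + 7*4 + 3 = 35 = IR3_DP_UCP7_W, and IR3_DP_VS_COUNT = 36 keeps the total vec4-aligned (9 vec4s).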
-
-#define IR3_MAX_SHADER_BUFFERS   32
-#define IR3_MAX_SHADER_IMAGES    32
-#define IR3_MAX_SO_BUFFERS        4
-#define IR3_MAX_SO_OUTPUTS       64
-
-/**
- * For consts needed to pass internal values to shader which may or may not
- * be required, rather than allocating worst-case const space, we scan the
- * shader and allocate consts as-needed:
- *
- *   + SSBO sizes: only needed if shader has a get_buffer_size intrinsic
- *     for a given SSBO
- *
- *   + Image dimensions: needed to calculate pixel offset, but only for
- *     images that have an image_store intrinsic
- */
-struct ir3_driver_const_layout {
-       struct {
-               uint32_t mask;  /* bitmask of SSBOs that have get_buffer_size */
-               uint32_t count; /* number of consts allocated */
-               /* one const allocated per SSBO which has get_buffer_size,
-                * ssbo_sizes.off[ssbo_id] is offset from start of ssbo_sizes
-                * consts:
-                */
-               uint32_t off[IR3_MAX_SHADER_BUFFERS];
-       } ssbo_size;
-
-       struct {
-               uint32_t mask;  /* bitmask of images that have image_store */
-               uint32_t count; /* number of consts allocated */
-               /* three const allocated per image which has image_store:
-                *  + cpp         (bytes per pixel)
-                *  + pitch       (y pitch)
-                *  + array_pitch (z pitch)
-                */
-               uint32_t off[IR3_MAX_SHADER_IMAGES];
-       } image_dims;
-};
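A sketch of how such a layout might be consumed (the helper is made up, and it assumes off[] is a scalar offset in dwords from the start of the section, with constbase giving the section's first vec4):

    /* which vec4 const and component hold SSBO 'i's buffer size: */
    static unsigned
    ssbo_size_const(const struct ir3_driver_const_layout *l,
            unsigned constbase_ssbo_sizes,  /* eg. v->constbase.ssbo_sizes */
            unsigned i, unsigned *comp)
    {
        unsigned off = l->ssbo_size.off[i];
        *comp = off & 0x3;                    /* x/y/z/w within the vec4 */
        return constbase_ssbo_sizes + (off / 4);
    }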
-
-/**
- * A single output for vertex transform feedback.
- */
-struct ir3_stream_output {
-       unsigned register_index:6;  /**< 0 to 63 (OUT index) */
-	unsigned start_component:2; /**< 0 to 3 */
-	unsigned num_components:3;  /**< 1 to 4 */
-       unsigned output_buffer:3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
-       unsigned dst_offset:16;     /**< offset into the buffer in dwords */
-       unsigned stream:2;          /**< 0 to 3 */
-};
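For example (values assumed), streaming the .yz components of OUT[2] into dwords 4..5 of transform-feedback buffer 0, on vertex stream 0, would be described as:

    struct ir3_stream_output out = {
        .register_index  = 2,  /* OUT[2] */
        .start_component = 1,  /* start at .y */
        .num_components  = 2,  /* .yz */
        .output_buffer   = 0,
        .dst_offset      = 4,  /* in dwords */
        .stream          = 0,
    };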
-
-/**
- * Stream output for vertex transform feedback.
- */
-struct ir3_stream_output_info {
-       unsigned num_outputs;
-       /** stride for an entire vertex for each buffer in dwords */
-       uint16_t stride[IR3_MAX_SO_BUFFERS];
-
-       /**
-        * Array of stream outputs, in the order they are to be written in.
-        * Selected components are tightly packed into the output buffer.
-        */
-       struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
-};
-
-/* Configuration key used to identify a shader variant.. different
- * shader variants can be used to implement features not supported
- * in hw (two-sided color), binning-pass vertex shader, etc.
- */
-struct ir3_shader_key {
-       union {
-               struct {
-                       /*
-                        * Combined Vertex/Fragment shader parameters:
-                        */
-                       unsigned ucp_enables : 8;
-
-                       /* do we need to check {v,f}saturate_{s,t,r}? */
-                       unsigned has_per_samp : 1;
-
-                       /*
-                        * Vertex shader variant parameters:
-                        */
-                       unsigned vclamp_color : 1;
-
-                       /*
-                        * Fragment shader variant parameters:
-                        */
-                       unsigned color_two_side : 1;
-                       unsigned half_precision : 1;
-                       /* used when shader needs to handle flat varyings (a4xx)
-                        * for front/back color inputs to frag shader:
-                        */
-                       unsigned rasterflat : 1;
-                       unsigned fclamp_color : 1;
-               };
-               uint32_t global;
-       };
-
-	/* bitmask of samplers which need coords clamped for vertex
-        * shader:
-        */
-       uint16_t vsaturate_s, vsaturate_t, vsaturate_r;
-
-	/* bitmask of samplers which need coords clamped for frag
-        * shader:
-        */
-       uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
-
-       /* bitmask of ms shifts */
-       uint32_t vsamples, fsamples;
-
-       /* bitmask of samplers which need astc srgb workaround: */
-       uint16_t vastc_srgb, fastc_srgb;
-};
-
-static inline bool
-ir3_shader_key_equal(struct ir3_shader_key *a, struct ir3_shader_key *b)
-{
-       /* slow-path if we need to check {v,f}saturate_{s,t,r} */
-       if (a->has_per_samp || b->has_per_samp)
-               return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
-       return a->global == b->global;
-}
-
-/* will the two keys produce different lowering for a fragment shader? */
-static inline bool
-ir3_shader_key_changes_fs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
-{
-       if (last_key->has_per_samp || key->has_per_samp) {
-               if ((last_key->fsaturate_s != key->fsaturate_s) ||
-                               (last_key->fsaturate_t != key->fsaturate_t) ||
-                               (last_key->fsaturate_r != key->fsaturate_r) ||
-                               (last_key->fsamples != key->fsamples) ||
-                               (last_key->fastc_srgb != key->fastc_srgb))
-                       return true;
-       }
-
-       if (last_key->fclamp_color != key->fclamp_color)
-               return true;
-
-       if (last_key->color_two_side != key->color_two_side)
-               return true;
-
-       if (last_key->half_precision != key->half_precision)
-               return true;
-
-       if (last_key->rasterflat != key->rasterflat)
-               return true;
-
-       if (last_key->ucp_enables != key->ucp_enables)
-               return true;
-
-       return false;
-}
-
-/* will the two keys produce different lowering for a vertex shader? */
-static inline bool
-ir3_shader_key_changes_vs(struct ir3_shader_key *key, struct ir3_shader_key *last_key)
-{
-       if (last_key->has_per_samp || key->has_per_samp) {
-               if ((last_key->vsaturate_s != key->vsaturate_s) ||
-                               (last_key->vsaturate_t != key->vsaturate_t) ||
-                               (last_key->vsaturate_r != key->vsaturate_r) ||
-                               (last_key->vsamples != key->vsamples) ||
-                               (last_key->vastc_srgb != key->vastc_srgb))
-                       return true;
-       }
-
-       if (last_key->vclamp_color != key->vclamp_color)
-               return true;
-
-       if (last_key->ucp_enables != key->ucp_enables)
-               return true;
-
-       return false;
-}
-
-/* clears shader-key flags which don't apply to the given shader
- * stage
- */
-static inline void
-ir3_normalize_key(struct ir3_shader_key *key, gl_shader_stage type)
-{
-       switch (type) {
-       case MESA_SHADER_FRAGMENT:
-               if (key->has_per_samp) {
-                       key->vsaturate_s = 0;
-                       key->vsaturate_t = 0;
-                       key->vsaturate_r = 0;
-                       key->vastc_srgb = 0;
-                       key->vsamples = 0;
-               }
-               break;
-       case MESA_SHADER_VERTEX:
-               key->color_two_side = false;
-               key->half_precision = false;
-               key->rasterflat = false;
-               if (key->has_per_samp) {
-                       key->fsaturate_s = 0;
-                       key->fsaturate_t = 0;
-                       key->fsaturate_r = 0;
-                       key->fastc_srgb = 0;
-                       key->fsamples = 0;
-               }
-               break;
-       default:
-               /* TODO */
-               break;
-       }
-
-}
-
-struct ir3_shader_variant {
-       struct fd_bo *bo;
-
-       /* variant id (for debug) */
-       uint32_t id;
-
-       struct ir3_shader_key key;
-
-       /* vertex shaders can have an extra version for hwbinning pass,
-        * which is pointed to by so->binning:
-        */
-       bool binning_pass;
-       struct ir3_shader_variant *binning;
-
-       struct ir3_driver_const_layout const_layout;
-       struct ir3_info info;
-       struct ir3 *ir;
-
-       /* the instructions length is in units of instruction groups
-        * (4 instructions for a3xx, 16 instructions for a4xx.. each
-        * instruction is 2 dwords):
-        */
-       unsigned instrlen;
-
-       /* the constants length is in units of vec4's, and is the sum of
-        * the uniforms and the built-in compiler constants
-        */
-       unsigned constlen;
-
-       /* number of uniforms (in vec4), not including built-in compiler
-        * constants, etc.
-        */
-       unsigned num_uniforms;
-
-       unsigned num_ubos;
-
-       /* About Linkage:
-        *   + Let the frag shader determine the position/compmask for the
-        *     varyings, since it is the place where we know if the varying
-        *     is actually used, and if so, which components are used.  So
-        *     what the hw calls "outloc" is taken from the "inloc" of the
-        *     frag shader.
-        *   + From the vert shader, we only need the output regid
-        */
-
-       bool frag_coord, frag_face, color0_mrt;
-
-       /* NOTE: for input/outputs, slot is:
-        *   gl_vert_attrib  - for VS inputs
-        *   gl_varying_slot - for VS output / FS input
-        *   gl_frag_result  - for FS output
-        */
-
-       /* varyings/outputs: */
-       unsigned outputs_count;
-       struct {
-               uint8_t slot;
-               uint8_t regid;
-       } outputs[16 + 2];  /* +POSITION +PSIZE */
-       bool writes_pos, writes_psize;
-
-       /* attributes (VS) / varyings (FS):
-        * Note that sysval's should come *after* normal inputs.
-        */
-       unsigned inputs_count;
-       struct {
-               uint8_t slot;
-               uint8_t regid;
-               uint8_t compmask;
-               uint8_t ncomp;
-               /* location of input (ie. offset passed to bary.f, etc).  This
-                * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
-                * have the OUTLOCn value offset by 8, presumably to account
-                * for gl_Position/gl_PointSize)
-                */
-               uint8_t inloc;
-               /* vertex shader specific: */
-               bool    sysval     : 1;   /* slot is a gl_system_value */
-               /* fragment shader specific: */
-               bool    bary       : 1;   /* fetched varying (vs one loaded into reg) */
-               bool    rasterflat : 1;   /* special handling for emit->rasterflat */
-               enum glsl_interp_mode interpolate;
-       } inputs[16 + 2];  /* +POSITION +FACE */
-
-       /* sum of input components (scalar).  For frag shaders, it only counts
-        * the varying inputs:
-        */
-       unsigned total_in;
-
-       /* For frag shaders, the total number of inputs (not scalar,
-        * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
-        */
-       unsigned varying_in;
-
-       /* number of samplers/textures (which are currently 1:1): */
-       int num_samp;
-
-       /* do we have one or more SSBO instructions: */
-       bool has_ssbo;
-
-       /* do we have kill instructions: */
-       bool has_kill;
-
-       /* Layout of constant registers: the start of each section, in
-        * vec4 units.  Pointer size is 32b (a3xx, a4xx) or 64b (a5xx+),
-        * which affects the size of the UBO and stream-out consts.
-        */
-       struct {
-               /* user const start at zero */
-               unsigned ubo;
-               /* NOTE that a3xx might need a section for SSBO addresses too */
-               unsigned ssbo_sizes;
-               unsigned image_dims;
-               unsigned driver_param;
-               unsigned tfbo;
-               unsigned immediate;
-       } constbase;
-
-       unsigned immediates_count;
-       unsigned immediates_size;
-       struct {
-               uint32_t val[4];
-       } *immediates;
-
-       /* for the astc srgb workaround: the number/base of additional
-        * alpha tex states we need, and the indices of the original tex states
-        */
-       struct {
-               unsigned base, count;
-               unsigned orig_idx[16];
-       } astc_srgb;
-
-       /* shader variants form a linked list: */
-       struct ir3_shader_variant *next;
-
-       /* replicated here to avoid passing extra ptrs everywhere: */
-       gl_shader_stage type;
-       struct ir3_shader *shader;
-};
-
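To make the instrlen units above concrete (group size 4 on a3xx, 16 on a4xx,
each instruction 2 dwords), here is a hypothetical rounding helper; it is
illustrative only, not the driver's code:

    /* hypothetical: round an instruction count up to whole groups */
    static unsigned
    calc_instrlen(unsigned ninstr, bool a4xx_plus)
    {
            unsigned group = a4xx_plus ? 16 : 4;
            return (ninstr + group - 1) / group;
    }

    /* e.g. 70 instructions on a4xx: calc_instrlen(70, true) == 5 groups,
     * i.e. 5 * 16 instructions * 2 dwords == 160 dwords reserved.
     */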
-struct ir3_shader {
-       gl_shader_stage type;
-
-       /* shader id (for debug): */
-       uint32_t id;
-       uint32_t variant_count;
-
-       /* so we know when we can disable TGSI-related hacks: */
-       bool from_tgsi;
-
-       struct ir3_compiler *compiler;
-
-       struct nir_shader *nir;
-       struct ir3_stream_output_info stream_output;
-
-       struct ir3_shader_variant *variants;
-};
-
-void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
-struct ir3_shader_variant * ir3_shader_get_variant(struct ir3_shader *shader,
-               struct ir3_shader_key *key, bool binning_pass, bool *created);
-struct ir3_shader * ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir);
-void ir3_shader_destroy(struct ir3_shader *shader);
-void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
-uint64_t ir3_shader_outputs(const struct ir3_shader *so);
-
-int ir3_glsl_type_size(const struct glsl_type *type);
-
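Read together, the declarations above imply the variant lifecycle.  A hedged
sketch (error handling and surrounding setup elided; compiler, nir, key and
gpu_id are assumed to already exist):

    struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir);

    bool created = false;
    struct ir3_shader_variant *v =
            ir3_shader_get_variant(shader, &key, false, &created);

    uint32_t *bin = ir3_shader_assemble(v, gpu_id); /* GPU machine code */
    ir3_shader_disasm(v, bin, stdout);              /* debug dump */

    ir3_shader_destroy(shader);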
-static inline const char *
-ir3_shader_stage(struct ir3_shader *shader)
-{
-       switch (shader->type) {
-       case MESA_SHADER_VERTEX:     return "VERT";
-       case MESA_SHADER_FRAGMENT:   return "FRAG";
-       case MESA_SHADER_COMPUTE:    return "CL";
-       default:
-               unreachable("invalid type");
-               return NULL;
-       }
-}
-
-/*
- * Helper/util:
- */
-
-static inline int
-ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
-{
-       int j;
-
-       for (j = 0; j < so->outputs_count; j++)
-               if (so->outputs[j].slot == slot)
-                       return j;
-
-       /* it seems optional to have an OUT.BCOLOR[n] for each OUT.COLOR[n]
-        * in the vertex shader.. but the fragment shader doesn't know this,
-        * so it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
-        * at link time, if there is no matching OUT.BCOLOR[n], we must map
-        * OUT.COLOR[n] to IN.BCOLOR[n].  And vice versa if there is only
-        * an OUT.BCOLOR[n] but no matching OUT.COLOR[n].
-        */
-       if (slot == VARYING_SLOT_BFC0) {
-               slot = VARYING_SLOT_COL0;
-       } else if (slot == VARYING_SLOT_BFC1) {
-               slot = VARYING_SLOT_COL1;
-       } else if (slot == VARYING_SLOT_COL0) {
-               slot = VARYING_SLOT_BFC0;
-       } else if (slot == VARYING_SLOT_COL1) {
-               slot = VARYING_SLOT_BFC1;
-       } else {
-               return 0;
-       }
-
-       for (j = 0; j < so->outputs_count; j++)
-               if (so->outputs[j].slot == slot)
-                       return j;
-
-       debug_assert(0);
-
-       return 0;
-}
-
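Because of the two-way COLOR/BCOLOR fallback above, a caller never needs to
know which of the pair the vertex shader actually wrote:

    /* finds OUT.BCOLOR0 if present, otherwise falls back to OUT.COLOR0: */
    int n = ir3_find_output(vs, VARYING_SLOT_BFC0);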
-static inline int
-ir3_next_varying(const struct ir3_shader_variant *so, int i)
-{
-       while (++i < so->inputs_count)
-               if (so->inputs[i].compmask && so->inputs[i].bary)
-                       break;
-       return i;
-}
-
-struct ir3_shader_linkage {
-       uint8_t max_loc;
-       uint8_t cnt;
-       struct {
-               uint8_t regid;
-               uint8_t compmask;
-               uint8_t loc;
-       } var[32];
-};
-
-static inline void
-ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc)
-{
-       int i = l->cnt++;
-
-       debug_assert(i < ARRAY_SIZE(l->var));
-
-       l->var[i].regid    = regid;
-       l->var[i].compmask = compmask;
-       l->var[i].loc      = loc;
-       l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
-}
-
-static inline void
-ir3_link_shaders(struct ir3_shader_linkage *l,
-               const struct ir3_shader_variant *vs,
-               const struct ir3_shader_variant *fs)
-{
-       int j = -1, k;
-
-       while (l->cnt < ARRAY_SIZE(l->var)) {
-               j = ir3_next_varying(fs, j);
-
-               if (j >= fs->inputs_count)
-                       break;
-
-               if (fs->inputs[j].inloc >= fs->total_in)
-                       continue;
-
-               k = ir3_find_output(vs, fs->inputs[j].slot);
-
-               ir3_link_add(l, vs->outputs[k].regid,
-                       fs->inputs[j].compmask, fs->inputs[j].inloc);
-       }
-}
-
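In practice, linking is just these helpers composed; a minimal usage sketch,
with vs/fs being already-compiled variants:

    struct ir3_shader_linkage l = {0};
    ir3_link_shaders(&l, vs, fs);

    /* l.var[0..l.cnt-1] now pairs the VS output regid feeding each
     * consumed FS input with its location; l.max_loc is one past the
     * highest component location in use.
     */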
-static inline uint32_t
-ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
-{
-       int j;
-       for (j = 0; j < so->outputs_count; j++)
-               if (so->outputs[j].slot == slot)
-                       return so->outputs[j].regid;
-       return regid(63, 0);
-}
-
-static inline uint32_t
-ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
-{
-       int j;
-       for (j = 0; j < so->inputs_count; j++)
-               if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
-                       return so->inputs[j].regid;
-       return regid(63, 0);
-}
-
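Typical emit-time lookups with the two helpers above; the slot enums come
from the mesa core headers, and regid(63, 0) is the "not present" sentinel
both return on a miss:

    uint32_t pos  = ir3_find_output_regid(vs, VARYING_SLOT_POS);
    uint32_t psiz = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ);
    uint32_t vid  = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);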
-/* calculate register footprint in terms of half-regs (ie. one full
- * reg counts as two half-regs).
- */
-static inline uint32_t
-ir3_shader_halfregs(const struct ir3_shader_variant *v)
-{
-       return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
-}
-
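A quick worked example of the footprint formula above:

    /* a variant whose highest full register is r3 (max_reg == 3) and
     * highest half register is hr1 (max_half_reg == 1):
     *   2 * (3 + 1) + (1 + 1) == 10 half-regs of footprint
     */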
-#endif /* IR3_SHADER_H_ */
index 797ba081758929e7b684f9e62b309ba9fc092370..f996126e3869031eff3a0ed998ea8de38ffac620 100644 (file)
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-ir3_nir_trig_c = custom_target(
-  'ir3_nir_trig.c',
-  input : 'ir3/ir3_nir_trig.py',
-  output : 'ir3_nir_trig.c',
-  command : [
-    prog_python, '@INPUT@',
-    '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
-  ],
-  capture : true,
-  depend_files : nir_algebraic_py,
-)
-
 files_libfreedreno = files(
   'adreno_common.xml.h',
   'adreno_pm4.xml.h',
@@ -215,35 +203,15 @@ files_libfreedreno = files(
   'a6xx/fd6_texture.h',
   'a6xx/fd6_zsa.c',
   'a6xx/fd6_zsa.h',
-  'ir3/disasm-a3xx.c',
-  'ir3/instr-a3xx.h',
-  'ir3/ir3.c',
   'ir3/ir3_cache.c',
   'ir3/ir3_cache.h',
-  'ir3/ir3_compiler_nir.c',
-  'ir3/ir3_compiler.c',
-  'ir3/ir3_compiler.h',
-  'ir3/ir3_cp.c',
-  'ir3/ir3_depth.c',
   'ir3/ir3_gallium.c',
   'ir3/ir3_gallium.h',
-  'ir3/ir3_group.c',
-  'ir3/ir3.h',
-  'ir3/ir3_legalize.c',
-  'ir3/ir3_nir.c',
-  'ir3/ir3_nir.h',
-  'ir3/ir3_nir_lower_tg4_to_tex.c',
-  'ir3/ir3_print.c',
-  'ir3/ir3_ra.c',
-  'ir3/ir3_sched.c',
-  'ir3/ir3_shader.c',
-  'ir3/ir3_shader.h',
 )
 
 freedreno_includes = [
   inc_src, inc_include, inc_gallium, inc_gallium_aux,
-  inc_freedreno,
-  include_directories('ir3')
+  inc_freedreno, include_directories('ir3'),
 ]
 
 freedreno_c_args = []
@@ -258,7 +226,7 @@ endif
 
 libfreedreno = static_library(
   'freedreno',
-  [files_libfreedreno, ir3_nir_trig_c],
+  [files_libfreedreno],
   include_directories : freedreno_includes,
   c_args : [freedreno_c_args, c_vis_args],
   cpp_args : [freedreno_cpp_args, cpp_vis_args],
@@ -273,6 +241,7 @@ driver_freedreno = declare_dependency(
     libfreedrenowinsys,
     libfreedreno,
     libfreedreno_drm,
+    libfreedreno_ir3,
   ],
   dependencies : idep_nir,
 )
@@ -288,6 +257,7 @@ ir3_compiler = executable(
   link_with : [
     libfreedreno,
     libfreedreno_drm,
+    libfreedreno_ir3,
     libgallium,
     libglsl_standalone,
     libmesa_util,