pan/mdg: Add disassembly for shadow gathers
[mesa.git] / src / panfrost / midgard / disassemble.c
index 94a8166674bed0733872bafba76e3fe9844dafeb..e872ea9923cf5a6f99beab8362bb2ad0af36b5ef 100644 (file)
 #include <ctype.h>
 #include <string.h>
 #include "midgard.h"
-#include "midgard-parse.h"
 #include "midgard_ops.h"
+#include "midgard_quirks.h"
 #include "disassemble.h"
 #include "helpers.h"
+#include "util/bitscan.h"
 #include "util/half_float.h"
 #include "util/u_math.h"
 
-#define DEFINE_CASE(define, str) case define: { printf(str); break; }
+#define DEFINE_CASE(define, str) case define: { fprintf(fp, str); break; }
 
 static unsigned *midg_tags;
 static bool is_instruction_int = false;
@@ -48,54 +49,60 @@ static bool is_instruction_int = false;
 
 static struct midgard_disasm_stats midg_stats;
 
-/* Prints a short form of the tag for branching, the minimum needed to be
- * legible and unambiguous */
+/* Transform an expanded writemask (one bit per 16-bit slot, so components
+ * wider than 16 bits own several duplicated bits) into its condensed form (one
+ * bit per component). 8-bit components are the exception: every input bit is
+ * doubled, turning the 8-bit mask into a 16-bit one. */
 
-static void
-print_tag_short(unsigned tag)
+static inline unsigned
+condense_writemask(unsigned expanded_mask,
+                   unsigned bits_per_component)
 {
-        switch (midgard_word_types[tag]) {
-        case midgard_word_type_texture:
-                printf("tex/%X", tag);
-                break;
+        if (bits_per_component == 8) {
+                /* Duplicate every bit to go from 8 to 16-channel wrmask */
+                unsigned omask = 0;
 
-        case midgard_word_type_load_store:
-                printf("ldst");
-                break;
+                for (unsigned i = 0; i < 8; ++i) {
+                        if (expanded_mask & (1 << i))
+                                omask |= (3 << (2 * i));
+                }
 
-        case midgard_word_type_alu:
-                printf("alu%u/%X", midgard_word_size[tag], tag);
-                break;
+                return omask;
+        }
 
-        default:
-                printf("%s%X", (tag > 0) ? "" : "unk", tag);
-                break;
+        unsigned slots_per_component = bits_per_component / 16;
+        unsigned max_comp = (16 * 8) / bits_per_component;
+        unsigned condensed_mask = 0;
+
+        /* Only the first slot of each component is sampled */
+        for (unsigned i = 0; i < max_comp; i++) {
+                if (expanded_mask & (1 << (i * slots_per_component)))
+                        condensed_mask |= (1 << i);
         }
+
+        return condensed_mask;
 }
 
+/* Print the mnemonic for an ALU opcode to fp, or "alu_op_XX" (hex) when
+ * alu_opcode_props has no name for it. As a side effect, records in
+ * is_instruction_int whether a named op is an integer op; unnamed ops are
+ * recorded as non-integer. */
 static void
-print_alu_opcode(midgard_alu_op op)
+print_alu_opcode(FILE *fp, midgard_alu_op op)
 {
         bool int_op = false;
 
         if (alu_opcode_props[op].name) {
-                printf("%s", alu_opcode_props[op].name);
+                fprintf(fp, "%s", alu_opcode_props[op].name);
 
                 int_op = midgard_is_integer_op(op);
         } else
-                printf("alu_op_%02X", op);
+                fprintf(fp, "alu_op_%02X", op);
 
         /* For constant analysis */
         is_instruction_int = int_op;
 }
 
+/* Print the name of a load/store opcode to fp, or "ldst_op_XX" (hex) when
+ * load_store_opcode_props has no name for it. */
 static void
-print_ld_st_opcode(midgard_load_store_op op)
+print_ld_st_opcode(FILE *fp, midgard_load_store_op op)
 {
         if (load_store_opcode_props[op].name)
-                printf("%s", load_store_opcode_props[op].name);
+                fprintf(fp, "%s", load_store_opcode_props[op].name);
         else
-                printf("ldst_op_%02X", op);
+                fprintf(fp, "ldst_op_%02X", op);
 }
 
 static bool is_embedded_constant_half = false;
@@ -123,7 +130,7 @@ prefix_for_bits(unsigned bits)
 uint16_t midg_ever_written = 0;
 
 static void
-print_reg(unsigned reg, unsigned bits)
+print_reg(FILE *fp, unsigned reg, unsigned bits)
 {
         /* Perform basic static analysis for expanding constants correctly */
 
@@ -156,15 +163,15 @@ print_reg(unsigned reg, unsigned bits)
         char prefix = prefix_for_bits(bits);
 
         if (prefix)
-                putchar(prefix);
+                fputc(prefix, fp);
 
-        printf("r%u", reg);
+        fprintf(fp, "r%u", reg);
 }
 
 static char *outmod_names_float[4] = {
         "",
         ".pos",
-        ".unk2",
+        ".sat_signed",
         ".sat"
 };
 
@@ -183,103 +190,116 @@ static char *srcmod_names_int[4] = {
 };
 
+/* Print the output-modifier suffix for outmod to fp, looked up in
+ * outmod_names_int when is_int is set and in outmod_names_float otherwise. */
 static void
-print_outmod(unsigned outmod, bool is_int)
+print_outmod(FILE *fp, unsigned outmod, bool is_int)
 {
-        printf("%s", is_int ? outmod_names_int[outmod] :
+        fprintf(fp, "%s", is_int ? outmod_names_int[outmod] :
                outmod_names_float[outmod]);
 }
 
+/* Print four 32-bit words to fp as comma-separated hex, then a newline. The
+ * tabs argument is not used by this function. */
 static void
-print_quad_word(uint32_t *words, unsigned tabs)
+print_quad_word(FILE *fp, uint32_t *words, unsigned tabs)
 {
         unsigned i;
 
         for (i = 0; i < 4; i++)
-                printf("0x%08X%s ", words[i], i == 3 ? "" : ",");
+                fprintf(fp, "0x%08X%s ", words[i], i == 3 ? "" : ",");
 
-        printf("\n");
+        fprintf(fp, "\n");
 }
 
 static const char components[16] = "xyzwefghijklmnop";
 
 /* Helper to print 4 chars of a swizzle */
+/* Each selector is 2 bits of swizzle; offset is added to it before indexing
+ * components[], so the caller chooses which group of letters is used. */
 static void
-print_swizzle_helper(unsigned swizzle, bool upper)
+print_swizzle_helper(FILE *fp, unsigned swizzle, unsigned offset)
 {
         for (unsigned i = 0; i < 4; ++i) {
                 unsigned c = (swizzle >> (i * 2)) & 3;
-                c += upper*4;
-                printf("%c", components[c]);
+                c += offset;
+                fprintf(fp, "%c", components[c]);
         }
 }
 
 /* Helper to print 8 chars of a swizzle, duplicating over */
+/* Each 2-bit selector s prints the adjacent letter pair at components[2*s] and
+ * components[2*s + 1]; upper shifts the pair by 8 letters. */
 static void
-print_swizzle_helper_8(unsigned swizzle, bool upper)
+print_swizzle_helper_8(FILE *fp, unsigned swizzle, bool upper)
 {
         for (unsigned i = 0; i < 4; ++i) {
                 unsigned c = (swizzle >> (i * 2)) & 3;
                 c *= 2;
                 c += upper*8;
-                printf("%c%c", components[c], components[c+1]);
+                fprintf(fp, "%c%c", components[c], components[c+1]);
         }
 }
 
+/* Print "." followed by a swizzle over 16 components. With the upper
+ * destination override, set rep bits are flagged with inline comments and a
+ * single group of 8 letters is printed (the upper letters only when rep_low is
+ * set and rep_high is not). Otherwise two groups of 8 are printed, the first
+ * selected by rep_high and the second by !rep_low. */
 static void
-print_swizzle_vec16(unsigned swizzle, bool rep_high, bool rep_low,
+print_swizzle_vec16(FILE *fp, unsigned swizzle, bool rep_high, bool rep_low,
                     midgard_dest_override override)
 {
-        printf(".");
+        fprintf(fp, ".");
 
         if (override == midgard_dest_override_upper) {
                 if (rep_high)
-                        printf(" /* rep_high */ ");
+                        fprintf(fp, " /* rep_high */ ");
                 if (rep_low)
-                        printf(" /* rep_low */ ");
+                        fprintf(fp, " /* rep_low */ ");
 
                 if (!rep_high && rep_low)
-                        print_swizzle_helper_8(swizzle, true);
+                        print_swizzle_helper_8(fp, swizzle, true);
                 else
-                        print_swizzle_helper_8(swizzle, false);
+                        print_swizzle_helper_8(fp, swizzle, false);
         } else {
-                print_swizzle_helper_8(swizzle, rep_high & 1);
-                print_swizzle_helper_8(swizzle, !(rep_low & 1));
+                print_swizzle_helper_8(fp, swizzle, rep_high & 1);
+                print_swizzle_helper_8(fp, swizzle, !(rep_low & 1));
         }
 }
 
+/* Print "." followed by two groups of four swizzle letters. For a half
+ * source, rep_low adds 8 to both groups and a clear rep_high adds 4 more to
+ * the second group; otherwise the groups are offset by rep_high * 4 and
+ * !rep_low * 4 respectively. */
 static void
-print_swizzle_vec8(unsigned swizzle, bool rep_high, bool rep_low)
+print_swizzle_vec8(FILE *fp, unsigned swizzle, bool rep_high, bool rep_low, bool half)
 {
-        printf(".");
+        fprintf(fp, ".");
 
-        print_swizzle_helper(swizzle, rep_high & 1);
-        print_swizzle_helper(swizzle, !(rep_low & 1));
+        /* TODO: Is it possible to unify half/full? */
+
+        if (half) {
+                print_swizzle_helper(fp, swizzle, (rep_low * 8));
+                print_swizzle_helper(fp, swizzle, (rep_low * 8) + !rep_high * 4);
+        } else {
+                print_swizzle_helper(fp, swizzle, rep_high * 4);
+                print_swizzle_helper(fp, swizzle, !rep_low * 4);
+        }
 }
 
+/* Print a four-component swizzle. A set rep_high is always flagged with an
+ * inline comment; a set rep_low is flagged only for non-half sources. The
+ * identity swizzle (0xE4) is omitted for non-half sources; in every other case
+ * "." and four letters are printed, offset by 4 when rep_low is set. */
 static void
-print_swizzle_vec4(unsigned swizzle, bool rep_high, bool rep_low)
+print_swizzle_vec4(FILE *fp, unsigned swizzle, bool rep_high, bool rep_low, bool half)
 {
         if (rep_high)
-                printf(" /* rep_high */ ");
-        if (rep_low)
-                printf(" /* rep_low */ ");
+                fprintf(fp, " /* rep_high */ ");
 
-        if (swizzle == 0xE4) return; /* xyzw */
+        if (!half && rep_low)
+                fprintf(fp, " /* rep_low */ ");
 
-        printf(".");
-        print_swizzle_helper(swizzle, 0);
+        if (swizzle == 0xE4 && !half) return; /* xyzw */
+
+        fprintf(fp, ".");
+        print_swizzle_helper(fp, swizzle, rep_low * 4);
 }
 static void
-print_swizzle_vec2(unsigned swizzle, bool rep_high, bool rep_low)
+print_swizzle_vec2(FILE *fp, unsigned swizzle, bool rep_high, bool rep_low, bool half)
 {
+        char *alphabet = "XY";
+
+        if (half) {
+                alphabet = rep_low ? "zw" : "xy";
+        } else if (rep_low)
+                fprintf(fp, " /* rep_low */ ");
+
         if (rep_high)
-                printf(" /* rep_high */ ");
-        if (rep_low)
-                printf(" /* rep_low */ ");
+                fprintf(fp, " /* rep_high */ ");
 
-        if (swizzle == 0xE4) return; /* XY */
+        if (swizzle == 0xE4 && !half) return; /* XY */
 
-        printf(".");
+        fprintf(fp, ".");
 
         for (unsigned i = 0; i < 4; i += 2) {
                 unsigned a = (swizzle >> (i * 2)) & 3;
@@ -288,14 +308,10 @@ print_swizzle_vec2(unsigned swizzle, bool rep_high, bool rep_low)
                 /* Normally we're adjacent, but if there's an issue, don't make
                  * it ambiguous */
 
-                if (a & 0x1)
-                        printf("[%c%c]", components[a], components[b]);
-                else if (a == b)
-                        printf("%c", components[a >> 1]);
-                else if (b == (a + 1))
-                        printf("%c", "XY"[a >> 1]);
+                if (b == (a + 1))
+                        fprintf(fp, "%c", alphabet[a >> 1]);
                 else
-                        printf("[%c%c]", components[a], components[b]);
+                        fprintf(fp, "[%c%c]", components[a], components[b]);
         }
 }
 
@@ -329,59 +345,144 @@ bits_for_mode_halved(midgard_reg_mode mode, bool half)
 }
 
+/* Print a scalar ALU source that is read from the constants in consts: "#"
+ * followed by the component selected by the source field, printed by
+ * mir_print_constant_component as 32-bit when the source is marked full and
+ * as 16-bit otherwise. consts must not be NULL. */
 static void
-print_vector_src(unsigned src_binary,
-                 midgard_reg_mode mode, unsigned reg,
-                 midgard_dest_override override, bool is_int)
+print_scalar_constant(FILE *fp, unsigned src_binary,
+                      const midgard_constants *consts,
+                      midgard_scalar_alu *alu)
+{
+        midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary;
+        assert(consts != NULL);
+
+        fprintf(fp, "#");
+        mir_print_constant_component(fp, consts, src->component,
+                                     src->full ?
+                                     midgard_reg_mode_32 : midgard_reg_mode_16,
+                                     false, src->mod, alu->op);
+}
+
+/* Print a vector ALU source that is read from the constants in consts.
+ *
+ * src_binary is the raw source field (decoded as midgard_vector_alu_src) and
+ * alu is the instruction it belongs to. One constant is printed for every
+ * component enabled in the instruction's effective writemask, with the
+ * component index derived from the swizzle and the rep_low/rep_high bits.
+ * Several components are printed as "<a, b, ...>"; a single component is
+ * printed with a "#" prefix, like print_scalar_constant does. consts must not
+ * be NULL. */
+static void
+print_vector_constants(FILE *fp, unsigned src_binary,
+                       const midgard_constants *consts,
+                       midgard_vector_alu *alu)
 {
         midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary;
+        unsigned bits = bits_for_mode_halved(alu->reg_mode, src->half);
+        unsigned max_comp = (sizeof(*consts) * 8) / bits;
+        unsigned comp_mask, num_comp = 0;
+
+        assert(consts);
+        assert(max_comp <= 16);
 
+        comp_mask = effective_writemask(alu->op, condense_writemask(alu->mask, bits));
+        num_comp = util_bitcount(comp_mask);
+
+        /* Only bracket real vectors, so "<" always pairs with the ">" that is
+         * printed under the same condition at the end */
+        if (num_comp > 1)
+                fprintf(fp, "<");
+        else
+                fprintf(fp, "#");
+
+        bool first = true;
+
+        for (unsigned i = 0; i < max_comp; ++i) {
+                if (!(comp_mask & (1 << i))) continue;
+
+                unsigned c = (src->swizzle >> (i * 2)) & 3;
+
+                if (bits == 16 && !src->half) {
+                        if (i < 4)
+                                c += (src->rep_high * 4);
+                        else
+                                c += (!src->rep_low * 4);
+                } else if (bits == 32 && !src->half) {
+                        /* Implicitly ok */
+                } else if (bits == 8) {
+                        assert (!src->half);
+                        unsigned index = (i >> 1) & 3;
+                        unsigned base = (src->swizzle >> (index * 2)) & 3;
+                        c = base * 2;
+
+                        if (i < 8)
+                                c += (src->rep_high) * 8;
+                        else
+                                c += (!src->rep_low) * 8;
+
+                        /* We work on twos, actually */
+                        if (i & 1)
+                                c++;
+                } else {
+                        /* Combination not handled above: dump the raw bits
+                         * into the same stream as the rest of the output */
+                        fprintf(fp, " (%d%d%d)", src->rep_low, src->rep_high, src->half);
+                }
+
+                if (first)
+                        first = false;
+                else
+                        fprintf(fp, ", ");
+
+                mir_print_constant_component(fp, consts, c, alu->reg_mode,
+                                             src->half, src->mod, alu->op);
+        }
+
+        if (num_comp > 1)
+                fprintf(fp, ">");
+}
+
+/* Print the text that precedes a source operand for modifier mod. Integer
+ * sources print their srcmod_names_int entry, preceded by "unk2" when the
+ * source is scalar and mod is 2. Float sources print "-" for the negate bit
+ * and "abs(" for the absolute-value bit. print_srcmod_end prints the matching
+ * text that follows the operand. */
+static void
+print_srcmod(FILE *fp, bool is_int, unsigned mod, bool scalar)
+{
         /* Modifiers change meaning depending on the op's context */
 
-        midgard_int_mod int_mod = src->mod;
+        midgard_int_mod int_mod = mod;
 
         if (is_int) {
-                printf("%s", srcmod_names_int[int_mod]);
+                if (scalar && mod == 2) {
+                        fprintf(fp, "unk2");
+                }
+
+                fprintf(fp, "%s", srcmod_names_int[int_mod]);
         } else {
-                if (src->mod & MIDGARD_FLOAT_MOD_NEG)
-                        printf("-");
+                if (mod & MIDGARD_FLOAT_MOD_NEG)
+                        fprintf(fp, "-");
 
-                if (src->mod & MIDGARD_FLOAT_MOD_ABS)
-                        printf("abs(");
+                if (mod & MIDGARD_FLOAT_MOD_ABS)
+                        fprintf(fp, "abs(");
         }
+}
+
+/* Print the text that follows a source operand for modifier mod: ") << bits"
+ * for the integer shift modifier, ")" for any other non-normal integer
+ * modifier or for the float absolute-value bit, and nothing otherwise. */
+static void
+print_srcmod_end(FILE *fp, bool is_int, unsigned mod, unsigned bits)
+{
+        /* Since we wrapped with a function-looking thing */
+
+        if (is_int && mod == midgard_int_shift)
+                fprintf(fp, ") << %u", bits);
+        else if ((is_int && (mod != midgard_int_normal))
+                 || (!is_int && mod & MIDGARD_FLOAT_MOD_ABS))
+                fprintf(fp, ")");
+}
+
+/* Print a vector ALU source operand: the modifier prefix, the register (sized
+ * by the instruction mode, halved when the source has `half` set), the
+ * swizzle in the form that matches the instruction mode, and finally the
+ * modifier suffix. */
+static void
+print_vector_src(FILE *fp, unsigned src_binary,
+                 midgard_reg_mode mode, unsigned reg,
+                 midgard_dest_override override, bool is_int)
+{
+        midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary;
+        print_srcmod(fp, is_int, src->mod, false);
 
         //register
         unsigned bits = bits_for_mode_halved(mode, src->half);
-        print_reg(reg, bits);
-
-        //swizzle
-        if (bits == 16) {
-                /* When the mode of the instruction is itself 16-bit,
-                 * rep_low/high work more or less as expected. But if the mode
-                 * is 32-bit and we're stepping down, you only have vec4 and
-                 * the meaning shifts to rep_low as higher-half and rep_high is
-                 * never seen. TODO: are other modes similar? */
-
-                if (mode == midgard_reg_mode_32) {
-                        printf(".");
-                        print_swizzle_helper(src->swizzle, src->rep_low);
-                        assert(!src->rep_high);
-                } else {
-                        print_swizzle_vec8(src->swizzle, src->rep_high, src->rep_low);
-                }
-        } else if (bits == 8)
-                print_swizzle_vec16(src->swizzle, src->rep_high, src->rep_low, override);
-        else if (bits == 32)
-                print_swizzle_vec4(src->swizzle, src->rep_high, src->rep_low);
-        else if (bits == 64)
-                print_swizzle_vec2(src->swizzle, src->rep_high, src->rep_low);
-
-        /* Since we wrapped with a function-looking thing */
+        print_reg(fp, reg, bits);
+
+        /* When the source was stepped down via `half`, rep_low means "higher
+         * half" and rep_high is never seen. When it is not stepped down,
+         * rep_low/rep_high are for, well, replication */
+
+        if (mode == midgard_reg_mode_8) {
+                assert(!src->half);
+                print_swizzle_vec16(fp, src->swizzle, src->rep_high, src->rep_low, override);
+        } else if (mode == midgard_reg_mode_16) {
+                print_swizzle_vec8(fp, src->swizzle, src->rep_high, src->rep_low, src->half);
+        } else if (mode == midgard_reg_mode_32) {
+                print_swizzle_vec4(fp, src->swizzle, src->rep_high, src->rep_low, src->half);
+        } else if (mode == midgard_reg_mode_64) {
+                print_swizzle_vec2(fp, src->swizzle, src->rep_high, src->rep_low, src->half);
+        }
 
-        if (is_int && int_mod == midgard_int_shift)
-                printf(") << %u", bits);
-        else if ((is_int && (int_mod != midgard_int_normal))
-                 || (!is_int && src->mod & MIDGARD_FLOAT_MOD_ABS))
-                printf(")");
+        print_srcmod_end(fp, is_int, src->mod, bits);
 }
 
 static uint16_t
@@ -395,12 +496,12 @@ decode_vector_imm(unsigned src2_reg, unsigned imm)
 }
 
+/* Print a 16-bit inline immediate with a "#" prefix: as an unsigned integer
+ * when is_instruction_int is set (see print_alu_opcode), otherwise as the
+ * value obtained by converting it from half-float. */
 static void
-print_immediate(uint16_t imm)
+print_immediate(FILE *fp, uint16_t imm)
 {
         if (is_instruction_int)
-                printf("#%u", imm);
+                fprintf(fp, "#%u", imm);
         else
-                printf("#%g", _mesa_half_to_float(imm));
+                fprintf(fp, "#%g", _mesa_half_to_float(imm));
 }
 
 static void
@@ -416,7 +517,7 @@ update_dest(unsigned reg)
 }
 
 static void
-print_dest(unsigned reg, midgard_reg_mode mode, midgard_dest_override override)
+print_dest(FILE *fp, unsigned reg, midgard_reg_mode mode, midgard_dest_override override)
 {
         /* Depending on the mode and override, we determine the type of
          * destination addressed. Absent an override, we address just the
@@ -428,17 +529,17 @@ print_dest(unsigned reg, midgard_reg_mode mode, midgard_dest_override override)
                 bits /= 2;
 
         update_dest(reg);
-        print_reg(reg, bits);
+        print_reg(fp, reg, bits);
 }
 
 static void
-print_mask_vec16(uint8_t mask, midgard_dest_override override)
+print_mask_vec16(FILE *fp, uint8_t mask, midgard_dest_override override)
 {
-        printf(".");
+        fprintf(fp, ".");
 
         for (unsigned i = 0; i < 8; i++) {
                 if (mask & (1 << i))
-                        printf("%c%c",
+                        fprintf(fp, "%c%c",
                                components[i*2 + 0],
                                components[i*2 + 1]);
         }
@@ -452,29 +553,19 @@ print_mask_vec16(uint8_t mask, midgard_dest_override override)
  * the mask to make it obvious what happened */
 
 static void
-print_mask(uint8_t mask, unsigned bits, midgard_dest_override override)
+print_mask(FILE *fp, uint8_t mask, unsigned bits, midgard_dest_override override)
 {
         if (bits == 8) {
-                print_mask_vec16(mask, override);
+                print_mask_vec16(fp, mask, override);
                 return;
         }
 
         /* Skip 'complete' masks */
 
-        if (override == midgard_dest_override_none) {
+        if (override == midgard_dest_override_none)
                 if (bits >= 32 && mask == 0xFF) return;
 
-                if (bits == 16) {
-                        if (mask == 0x0F)
-                                return;
-                        else if (mask == 0xF0) {
-                                printf("'");
-                                return;
-                        }
-                }
-        }
-
-        printf(".");
+        fprintf(fp, ".");
 
         unsigned skip = (bits / 16);
         bool uppercase = bits > 32;
@@ -486,10 +577,8 @@ print_mask(uint8_t mask, unsigned bits, midgard_dest_override override)
 
         const char *alphabet = components;
 
-        if (override == midgard_dest_override_upper) {
-                unsigned components = 128 / bits;
-                alphabet += components;
-        }
+        if (override == midgard_dest_override_upper)
+                alphabet += (128 / bits);
 
         for (unsigned i = 0; i < 8; i += skip) {
                 bool a = (mask & (1 << i)) != 0;
@@ -505,12 +594,12 @@ print_mask(uint8_t mask, unsigned bits, midgard_dest_override override)
                         if (uppercase)
                                 c = toupper(c);
 
-                        printf("%c", c);
+                        fprintf(fp, "%c", c);
                 }
         }
 
         if (tripped)
-                printf(" /* %X */", mask);
+                fprintf(fp, " /* %X */", mask);
 }
 
 /* Prints the 4-bit masks found in texture and load/store ops, as opposed to
@@ -518,27 +607,27 @@ print_mask(uint8_t mask, unsigned bits, midgard_dest_override override)
  * mode as well, but not load/store-style 16-bit mode. */
 
 static void
-print_mask_4(unsigned mask, bool upper)
+print_mask_4(FILE *fp, unsigned mask, bool upper)
 {
+        /* A full mask prints nothing, or just "'" when upper is set */
         if (mask == 0xF) {
                 if (upper)
-                        printf("'");
+                        fprintf(fp, "'");
 
                 return;
         }
 
-        printf(".");
+        fprintf(fp, ".");
 
+        /* Otherwise list the enabled components, shifted by 4 letters when
+         * upper is set */
         for (unsigned i = 0; i < 4; ++i) {
                 bool a = (mask & (1 << i)) != 0;
                 if (a)
-                        printf("%c", components[i + (upper ? 4 : 0)]);
+                        fprintf(fp, "%c", components[i + (upper ? 4 : 0)]);
         }
 }
 
 static void
-print_vector_field(const char *name, uint16_t *words, uint16_t reg_word,
-                   unsigned tabs)
+print_vector_field(FILE *fp, const char *name, uint16_t *words, uint16_t reg_word,
+                   const midgard_constants *consts, unsigned tabs)
 {
         midgard_reg_info *reg_info = (midgard_reg_info *)&reg_word;
         midgard_vector_alu *alu_field = (midgard_vector_alu *) words;
@@ -547,70 +636,82 @@ print_vector_field(const char *name, uint16_t *words, uint16_t reg_word,
 
         /* For now, prefix instruction names with their unit, until we
          * understand how this works on a deeper level */
-        printf("%s.", name);
+        fprintf(fp, "%s.", name);
 
-        print_alu_opcode(alu_field->op);
+        print_alu_opcode(fp, alu_field->op);
 
         /* Postfix with the size to disambiguate if necessary */
         char postfix = prefix_for_bits(bits_for_mode(mode));
         bool size_ambiguous = override != midgard_dest_override_none;
 
         if (size_ambiguous)
-                printf("%c", postfix ? postfix : 'r');
+                fprintf(fp, "%c", postfix ? postfix : 'r');
 
         /* Print the outmod, if there is one */
-        print_outmod(alu_field->outmod,
+        print_outmod(fp, alu_field->outmod,
                      midgard_is_integer_out_op(alu_field->op));
 
-        printf(" ");
+        fprintf(fp, " ");
 
         /* Mask denoting status of 8-lanes */
         uint8_t mask = alu_field->mask;
 
         /* First, print the destination */
-        print_dest(reg_info->out_reg, mode, alu_field->dest_override);
+        print_dest(fp, reg_info->out_reg, mode, alu_field->dest_override);
 
         if (override != midgard_dest_override_none) {
                 bool modeable = (mode != midgard_reg_mode_8);
                 bool known = override != 0x3; /* Unused value */
 
                 if (!(modeable && known))
-                        printf("/* do%u */ ", override);
+                        fprintf(fp, "/* do%u */ ", override);
         }
 
-        print_mask(mask, bits_for_mode(mode), override);
+        /* Instructions like fdot4 do *not* replicate, ensure the
+         * mask is of only a single component */
+
+        unsigned rep = GET_CHANNEL_COUNT(alu_opcode_props[alu_field->op].props);
 
-        printf(", ");
+        if (rep) {
+                unsigned comp_mask = condense_writemask(mask, bits_for_mode(mode));
+                unsigned num_comp = util_bitcount(comp_mask);
+                if (num_comp != 1)
+                        fprintf(fp, "/* err too many components */");
+        }
+        print_mask(fp, mask, bits_for_mode(mode), override);
+
+        fprintf(fp, ", ");
 
         bool is_int = midgard_is_integer_op(alu_field->op);
-        print_vector_src(alu_field->src1, mode, reg_info->src1_reg, override, is_int);
 
-        printf(", ");
+        if (reg_info->src1_reg == 26)
+                print_vector_constants(fp, alu_field->src1, consts, alu_field);
+        else
+                print_vector_src(fp, alu_field->src1, mode, reg_info->src1_reg, override, is_int);
+
+        fprintf(fp, ", ");
 
         if (reg_info->src2_imm) {
                 uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2);
-                print_immediate(imm);
+                print_immediate(fp, imm);
+        } else if (reg_info->src2_reg == 26) {
+                print_vector_constants(fp, alu_field->src2, consts, alu_field);
         } else {
-                print_vector_src(alu_field->src2, mode,
+                print_vector_src(fp, alu_field->src2, mode,
                                  reg_info->src2_reg, override, is_int);
         }
 
         midg_stats.instruction_count++;
-        printf("\n");
+        fprintf(fp, "\n");
 }
 
 static void
-print_scalar_src(unsigned src_binary, unsigned reg)
+print_scalar_src(FILE *fp, bool is_int, unsigned src_binary, unsigned reg)
 {
         midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary;
 
-        if (src->negate)
-                printf("-");
-
-        if (src->abs)
-                printf("abs(");
-
-        print_reg(reg, src->full ? 32 : 16);
+        print_srcmod(fp, is_int, src->mod, true);
+        print_reg(fp, reg, src->full ? 32 : 16);
 
         unsigned c = src->component;
 
@@ -619,11 +720,9 @@ print_scalar_src(unsigned src_binary, unsigned reg)
                 c >>= 1;
         }
 
-        printf(".%c", components[c]);
-
-        if (src->abs)
-                printf(")");
+        fprintf(fp, ".%c", components[c]);
 
+        print_srcmod_end(fp, is_int, src->mod, src->full ? 32 : 16);
 }
 
 static uint16_t
@@ -639,126 +738,133 @@ decode_scalar_imm(unsigned src2_reg, unsigned imm)
 }
 
+/* Print one scalar ALU instruction to fp as
+ * "<name>.<op><outmod> <dest>.<c>, <src1>, <src2>" followed by a newline, and
+ * count it in midg_stats.instruction_count.
+ *
+ * words holds the instruction (decoded as midgard_scalar_alu) and reg_word
+ * the register word (decoded as midgard_reg_info). A source whose register
+ * number is 26 is printed from consts instead of as a register; src2 may
+ * instead be an inline immediate. The tabs argument is not used here. */
 static void
-print_scalar_field(const char *name, uint16_t *words, uint16_t reg_word,
-                   unsigned tabs)
+print_scalar_field(FILE *fp, const char *name, uint16_t *words, uint16_t reg_word,
+                   const midgard_constants *consts, unsigned tabs)
 {
         midgard_reg_info *reg_info = (midgard_reg_info *)&reg_word;
         midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words;
 
         if (alu_field->unknown)
-                printf("scalar ALU unknown bit set\n");
+                fprintf(fp, "scalar ALU unknown bit set\n");
 
-        printf("%s.", name);
-        print_alu_opcode(alu_field->op);
-        print_outmod(alu_field->outmod,
+        fprintf(fp, "%s.", name);
+        print_alu_opcode(fp, alu_field->op);
+        print_outmod(fp, alu_field->outmod,
                      midgard_is_integer_out_op(alu_field->op));
-        printf(" ");
+        fprintf(fp, " ");
 
         bool full = alu_field->output_full;
         update_dest(reg_info->out_reg);
-        print_reg(reg_info->out_reg, full ? 32 : 16);
+        print_reg(fp, reg_info->out_reg, full ? 32 : 16);
         unsigned c = alu_field->output_component;
+        bool is_int = midgard_is_integer_op(alu_field->op);
 
         if (full) {
                 assert((c & 1) == 0);
                 c >>= 1;
         }
 
-        printf(".%c, ", components[c]);
+        fprintf(fp, ".%c, ", components[c]);
 
-        print_scalar_src(alu_field->src1, reg_info->src1_reg);
+        if (reg_info->src1_reg == 26)
+                print_scalar_constant(fp, alu_field->src1, consts, alu_field);
+        else
+                print_scalar_src(fp, is_int, alu_field->src1, reg_info->src1_reg);
 
-        printf(", ");
+        fprintf(fp, ", ");
 
         if (reg_info->src2_imm) {
                 uint16_t imm = decode_scalar_imm(reg_info->src2_reg,
                                                  alu_field->src2);
-                print_immediate(imm);
+                print_immediate(fp, imm);
+        } else if (reg_info->src2_reg == 26) {
+                print_scalar_constant(fp, alu_field->src2, consts, alu_field);
         } else
-                print_scalar_src(alu_field->src2, reg_info->src2_reg);
+                print_scalar_src(fp, is_int, alu_field->src2, reg_info->src2_reg);
 
         midg_stats.instruction_count++;
-        printf("\n");
+        fprintf(fp, "\n");
 }
 
+/* Print the name of a branch/writeout op to fp, followed by "."; values
+ * without a case below are printed as "unk<N>." in decimal. */
 static void
-print_branch_op(unsigned op)
+print_branch_op(FILE *fp, unsigned op)
 {
         switch (op) {
         case midgard_jmp_writeout_op_branch_uncond:
-                printf("uncond.");
+                fprintf(fp, "uncond.");
                 break;
 
         case midgard_jmp_writeout_op_branch_cond:
-                printf("cond.");
+                fprintf(fp, "cond.");
                 break;
 
         case midgard_jmp_writeout_op_writeout:
-                printf("write.");
+                fprintf(fp, "write.");
                 break;
 
         case midgard_jmp_writeout_op_tilebuffer_pending:
-                printf("tilebuffer.");
+                fprintf(fp, "tilebuffer.");
                 break;
 
         case midgard_jmp_writeout_op_discard:
-                printf("discard.");
+                fprintf(fp, "discard.");
                 break;
 
         default:
-                printf("unk%u.", op);
+                fprintf(fp, "unk%u.", op);
                 break;
         }
 }
 
+/* Print the name of a branch condition to fp; values without a case below
+ * are printed as "unk<X>" in hex. */
 static void
-print_branch_cond(int cond)
+print_branch_cond(FILE *fp, int cond)
 {
         switch (cond) {
         case midgard_condition_write0:
-                printf("write0");
+                fprintf(fp, "write0");
                 break;
 
         case midgard_condition_false:
-                printf("false");
+                fprintf(fp, "false");
                 break;
 
         case midgard_condition_true:
-                printf("true");
+                fprintf(fp, "true");
                 break;
 
         case midgard_condition_always:
-                printf("always");
+                fprintf(fp, "always");
                 break;
 
         default:
-                printf("unk%X", cond);
+                fprintf(fp, "unk%X", cond);
                 break;
         }
 }
 
-static void
-print_compact_branch_writeout_field(uint16_t word)
+static bool
+print_compact_branch_writeout_field(FILE *fp, uint16_t word)
 {
         midgard_jmp_writeout_op op = word & 0x7;
+        midg_stats.instruction_count++;
 
         switch (op) {
         case midgard_jmp_writeout_op_branch_uncond: {
                 midgard_branch_uncond br_uncond;
                 memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond));
-                printf("br.uncond ");
+                fprintf(fp, "br.uncond ");
 
                 if (br_uncond.unknown != 1)
-                        printf("unknown:%u, ", br_uncond.unknown);
+                        fprintf(fp, "unknown:%u, ", br_uncond.unknown);
 
                 if (br_uncond.offset >= 0)
-                        printf("+");
+                        fprintf(fp, "+");
 
-                printf("%d -> ", br_uncond.offset);
-                print_tag_short(br_uncond.dest_tag);
-                printf("\n");
+                fprintf(fp, "%d -> %s", br_uncond.offset,
+                                midgard_tag_props[br_uncond.dest_tag].name);
+                fprintf(fp, "\n");
 
-                break;
+                return br_uncond.offset >= 0;
         }
 
         case midgard_jmp_writeout_op_branch_cond:
@@ -768,36 +874,36 @@ print_compact_branch_writeout_field(uint16_t word)
                 midgard_branch_cond br_cond;
                 memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond));
 
-                printf("br.");
+                fprintf(fp, "br.");
 
-                print_branch_op(br_cond.op);
-                print_branch_cond(br_cond.cond);
+                print_branch_op(fp, br_cond.op);
+                print_branch_cond(fp, br_cond.cond);
 
-                printf(" ");
+                fprintf(fp, " ");
 
                 if (br_cond.offset >= 0)
-                        printf("+");
+                        fprintf(fp, "+");
 
-                printf("%d -> ", br_cond.offset);
-                print_tag_short(br_cond.dest_tag);
-                printf("\n");
+                fprintf(fp, "%d -> %s", br_cond.offset,
+                                midgard_tag_props[br_cond.dest_tag].name);
+                fprintf(fp, "\n");
 
-                break;
+                return br_cond.offset >= 0;
         }
         }
 
-        midg_stats.instruction_count++;
+        return false;
 }
 
-static void
-print_extended_branch_writeout_field(uint8_t *words, unsigned next)
+static bool
+print_extended_branch_writeout_field(FILE *fp, uint8_t *words, unsigned next)
 {
         midgard_branch_extended br;
         memcpy((char *) &br, (char *) words, sizeof(br));
 
-        printf("brx.");
+        fprintf(fp, "brx.");
 
-        print_branch_op(br.op);
+        print_branch_op(fp, br.op);
 
         /* Condition codes are a LUT in the general case, but simply repeated 8 times for single-channel conditions.. Check this. */
 
@@ -808,35 +914,33 @@ print_extended_branch_writeout_field(uint8_t *words, unsigned next)
         }
 
         if (single_channel)
-                print_branch_cond(br.cond & 0x3);
+                print_branch_cond(fp, br.cond & 0x3);
         else
-                printf("lut%X", br.cond);
+                fprintf(fp, "lut%X", br.cond);
 
         if (br.unknown)
-                printf(".unknown%u", br.unknown);
+                fprintf(fp, ".unknown%u", br.unknown);
 
-        printf(" ");
+        fprintf(fp, " ");
 
         if (br.offset >= 0)
-                printf("+");
+                fprintf(fp, "+");
 
-        printf("%d -> ", br.offset);
-        print_tag_short(br.dest_tag);
-        printf("\n");
+        fprintf(fp, "%d -> %s\n", br.offset,
+                        midgard_tag_props[br.dest_tag].name);
 
         unsigned I = next + br.offset * 4;
 
         if (midg_tags[I] && midg_tags[I] != br.dest_tag) {
-                printf("\t/* XXX TAG ERROR: jumping to ");
-                print_tag_short(br.dest_tag);
-                printf(" but tagged ");
-                print_tag_short(midg_tags[I]);
-                printf(" */\n");
+                fprintf(fp, "\t/* XXX TAG ERROR: jumping to %s but tagged %s */\n",
+                        midgard_tag_props[br.dest_tag].name,
+                        midgard_tag_props[midg_tags[I]].name);
         }
 
         midg_tags[I] = br.dest_tag;
 
         midg_stats.instruction_count++;
+        return br.offset >= 0;
 }
 
 static unsigned
@@ -862,20 +966,8 @@ num_alu_fields_enabled(uint32_t control_word)
         return ret;
 }
 
-static float
-float_bitcast(uint32_t integer)
-{
-        union {
-                uint32_t i;
-                float f;
-        } v;
-
-        v.i = integer;
-        return v.f;
-}
-
-static void
-print_alu_word(uint32_t *words, unsigned num_quad_words,
+static bool
+print_alu_word(FILE *fp, uint32_t *words, unsigned num_quad_words,
                unsigned tabs, unsigned next)
 {
         uint32_t control_word = words[0];
@@ -883,112 +975,100 @@ print_alu_word(uint32_t *words, unsigned num_quad_words,
         unsigned num_fields = num_alu_fields_enabled(control_word);
         uint16_t *word_ptr = beginning_ptr + num_fields;
         unsigned num_words = 2 + num_fields;
+        const midgard_constants *consts = NULL;
+        bool branch_forward = false;
+
+        if ((control_word >> 17) & 1)
+                num_words += 3;
+
+        if ((control_word >> 19) & 1)
+                num_words += 2;
+
+        if ((control_word >> 21) & 1)
+                num_words += 3;
+
+        if ((control_word >> 23) & 1)
+                num_words += 2;
+
+        if ((control_word >> 25) & 1)
+                num_words += 3;
+
+        if ((control_word >> 26) & 1)
+                num_words += 1;
+
+        if ((control_word >> 27) & 1)
+                num_words += 3;
+
+        if (num_quad_words > (num_words + 7) / 8) {
+                assert(num_quad_words == (num_words + 15) / 8);
+                //Assume that the extra quadword is constants
+                consts = (midgard_constants *)(words + (4 * num_quad_words - 4));
+        }
 
         if ((control_word >> 16) & 1)
-                printf("unknown bit 16 enabled\n");
+                fprintf(fp, "unknown bit 16 enabled\n");
 
         if ((control_word >> 17) & 1) {
-                print_vector_field("vmul", word_ptr, *beginning_ptr, tabs);
+                print_vector_field(fp, "vmul", word_ptr, *beginning_ptr, consts, tabs);
                 beginning_ptr += 1;
                 word_ptr += 3;
-                num_words += 3;
         }
 
         if ((control_word >> 18) & 1)
-                printf("unknown bit 18 enabled\n");
+                fprintf(fp, "unknown bit 18 enabled\n");
 
         if ((control_word >> 19) & 1) {
-                print_scalar_field("sadd", word_ptr, *beginning_ptr, tabs);
+                print_scalar_field(fp, "sadd", word_ptr, *beginning_ptr, consts, tabs);
                 beginning_ptr += 1;
                 word_ptr += 2;
-                num_words += 2;
         }
 
         if ((control_word >> 20) & 1)
-                printf("unknown bit 20 enabled\n");
+                fprintf(fp, "unknown bit 20 enabled\n");
 
         if ((control_word >> 21) & 1) {
-                print_vector_field("vadd", word_ptr, *beginning_ptr, tabs);
+                print_vector_field(fp, "vadd", word_ptr, *beginning_ptr, consts, tabs);
                 beginning_ptr += 1;
                 word_ptr += 3;
-                num_words += 3;
         }
 
         if ((control_word >> 22) & 1)
-                printf("unknown bit 22 enabled\n");
+                fprintf(fp, "unknown bit 22 enabled\n");
 
         if ((control_word >> 23) & 1) {
-                print_scalar_field("smul", word_ptr, *beginning_ptr, tabs);
+                print_scalar_field(fp, "smul", word_ptr, *beginning_ptr, consts, tabs);
                 beginning_ptr += 1;
                 word_ptr += 2;
-                num_words += 2;
         }
 
         if ((control_word >> 24) & 1)
-                printf("unknown bit 24 enabled\n");
+                fprintf(fp, "unknown bit 24 enabled\n");
 
         if ((control_word >> 25) & 1) {
-                print_vector_field("lut", word_ptr, *beginning_ptr, tabs);
+                print_vector_field(fp, "lut", word_ptr, *beginning_ptr, consts, tabs);
                 word_ptr += 3;
-                num_words += 3;
         }
 
         if ((control_word >> 26) & 1) {
-                print_compact_branch_writeout_field(*word_ptr);
+                branch_forward |= print_compact_branch_writeout_field(fp, *word_ptr);
                 word_ptr += 1;
-                num_words += 1;
         }
 
         if ((control_word >> 27) & 1) {
-                print_extended_branch_writeout_field((uint8_t *) word_ptr, next);
+                branch_forward |= print_extended_branch_writeout_field(fp, (uint8_t *) word_ptr, next);
                 word_ptr += 3;
-                num_words += 3;
         }
 
-        if (num_quad_words > (num_words + 7) / 8) {
-                assert(num_quad_words == (num_words + 15) / 8);
-                //Assume that the extra quadword is constants
-                void *consts = words + (4 * num_quad_words - 4);
-
-                if (is_embedded_constant_int) {
-                        if (is_embedded_constant_half) {
-                                int16_t *sconsts = (int16_t *) consts;
-                                printf("sconstants %d, %d, %d, %d\n",
-                                       sconsts[0],
-                                       sconsts[1],
-                                       sconsts[2],
-                                       sconsts[3]);
-                        } else {
-                                uint32_t *iconsts = (uint32_t *) consts;
-                                printf("iconstants 0x%X, 0x%X, 0x%X, 0x%X\n",
-                                       iconsts[0],
-                                       iconsts[1],
-                                       iconsts[2],
-                                       iconsts[3]);
-                        }
-                } else {
-                        if (is_embedded_constant_half) {
-                                uint16_t *hconsts = (uint16_t *) consts;
-                                printf("hconstants %g, %g, %g, %g\n",
-                                       _mesa_half_to_float(hconsts[0]),
-                                       _mesa_half_to_float(hconsts[1]),
-                                       _mesa_half_to_float(hconsts[2]),
-                                       _mesa_half_to_float(hconsts[3]));
-                        } else {
-                                uint32_t *fconsts = (uint32_t *) consts;
-                                printf("fconstants %g, %g, %g, %g\n",
-                                       float_bitcast(fconsts[0]),
-                                       float_bitcast(fconsts[1]),
-                                       float_bitcast(fconsts[2]),
-                                       float_bitcast(fconsts[3]));
-                        }
+        if (consts)
+                fprintf(fp, "uconstants 0x%X, 0x%X, 0x%X, 0x%X\n",
+                        consts->u32[0], consts->u32[1],
+                        consts->u32[2], consts->u32[3]);
 
-                }
-        }
+        return branch_forward;
 }
 
 static void
-print_varying_parameters(midgard_load_store_word *word)
+print_varying_parameters(FILE *fp, midgard_load_store_word *word)
 {
         midgard_varying_parameter param;
         unsigned v = word->varying_parameters;
@@ -997,29 +1077,31 @@ print_varying_parameters(midgard_load_store_word *word)
         if (param.is_varying) {
                 /* If a varying, there are qualifiers */
                 if (param.flat)
-                        printf(".flat");
+                        fprintf(fp, ".flat");
 
                 if (param.interpolation != midgard_interp_default) {
                         if (param.interpolation == midgard_interp_centroid)
-                                printf(".centroid");
+                                fprintf(fp, ".centroid");
+                        else if (param.interpolation == midgard_interp_sample)
+                                fprintf(fp, ".sample");
                         else
-                                printf(".interp%d", param.interpolation);
+                                fprintf(fp, ".interp%d", param.interpolation);
                 }
 
                 if (param.modifier != midgard_varying_mod_none) {
                         if (param.modifier == midgard_varying_mod_perspective_w)
-                                printf(".perspectivew");
+                                fprintf(fp, ".perspectivew");
                         else if (param.modifier == midgard_varying_mod_perspective_z)
-                                printf(".perspectivez");
+                                fprintf(fp, ".perspectivez");
                         else
-                                printf(".mod%d", param.modifier);
+                                fprintf(fp, ".mod%d", param.modifier);
                 }
         } else if (param.flat || param.interpolation || param.modifier) {
-                printf(" /* is_varying not set but varying metadata attached */");
+                fprintf(fp, " /* is_varying not set but varying metadata attached */");
         }
 
         if (param.zero0 || param.zero1 || param.zero2)
-                printf(" /* zero tripped, %u %u %u */ ", param.zero0, param.zero1, param.zero2);
+                fprintf(fp, " /* zero tripped, %u %u %u */ ", param.zero0, param.zero1, param.zero2);
 }
 
 static bool
@@ -1055,7 +1137,7 @@ is_op_attribute(unsigned op)
 }
 
 static void
-print_load_store_arg(uint8_t arg, unsigned index)
+print_load_store_arg(FILE *fp, uint8_t arg, unsigned index)
 {
         /* Try to interpret as a register */
         midgard_ldst_register_select sel;
@@ -1065,23 +1147,23 @@ print_load_store_arg(uint8_t arg, unsigned index)
          * interpret it. But if it's zero, we get it. */
 
         if (sel.unknown) {
-                printf("0x%02X", arg);
+                fprintf(fp, "0x%02X", arg);
                 return;
         }
 
         unsigned reg = REGISTER_LDST_BASE + sel.select;
         char comp = components[sel.component];
 
-        printf("r%u.%c", reg, comp);
+        fprintf(fp, "r%u.%c", reg, comp);
 
         /* Only print a shift if it's non-zero. Shifts only make sense for the
          * second index. For the first, we're not sure what it means yet */
 
         if (index == 1) {
                 if (sel.shift)
-                        printf(" << %u", sel.shift);
+                        fprintf(fp, " << %u", sel.shift);
         } else {
-                printf(" /* %X */", sel.shift);
+                fprintf(fp, " /* %X */", sel.shift);
         }
 }
 
@@ -1093,17 +1175,17 @@ update_stats(signed *stat, unsigned address)
 }
 
 static void
-print_load_store_instr(uint64_t data,
+print_load_store_instr(FILE *fp, uint64_t data,
                        unsigned tabs)
 {
         midgard_load_store_word *word = (midgard_load_store_word *) &data;
 
-        print_ld_st_opcode(word->op);
+        print_ld_st_opcode(fp, word->op);
 
         unsigned address = word->address;
 
         if (is_op_varying(word->op)) {
-                print_varying_parameters(word);
+                print_varying_parameters(fp, word);
 
                 /* Do some analysis: check if direct cacess */
 
@@ -1118,8 +1200,8 @@ print_load_store_instr(uint64_t data,
                         midg_stats.attribute_count = -16;
         }
 
-        printf(" r%u", word->reg);
-        print_mask_4(word->mask, false);
+        fprintf(fp, " r%u", word->reg + (OP_IS_STORE(word->op) ? 26 : 0));
+        print_mask_4(fp, word->mask, false);
 
         if (!OP_IS_STORE(word->op))
                 update_dest(word->reg);
@@ -1136,49 +1218,49 @@ print_load_store_instr(uint64_t data,
                 address = (hi << 3) | lo;
         }
 
-        printf(", %u", address);
+        fprintf(fp, ", %u", address);
 
-        print_swizzle_vec4(word->swizzle, false, false);
+        print_swizzle_vec4(fp, word->swizzle, false, false, false);
 
-        printf(", ");
+        fprintf(fp, ", ");
 
         if (is_ubo) {
-                printf("ubo%u", word->arg_1);
+                fprintf(fp, "ubo%u", word->arg_1);
                 update_stats(&midg_stats.uniform_buffer_count, word->arg_1);
         } else
-                print_load_store_arg(word->arg_1, 0);
+                print_load_store_arg(fp, word->arg_1, 0);
 
-        printf(", ");
-        print_load_store_arg(word->arg_2, 1);
-        printf(" /* %X */\n", word->varying_parameters);
+        fprintf(fp, ", ");
+        print_load_store_arg(fp, word->arg_2, 1);
+        fprintf(fp, " /* %X */\n", word->varying_parameters);
 
         midg_stats.instruction_count++;
 }
 
 static void
-print_load_store_word(uint32_t *word, unsigned tabs)
+print_load_store_word(FILE *fp, uint32_t *word, unsigned tabs)
 {
         midgard_load_store *load_store = (midgard_load_store *) word;
 
         if (load_store->word1 != 3) {
-                print_load_store_instr(load_store->word1, tabs);
+                print_load_store_instr(fp, load_store->word1, tabs);
         }
 
         if (load_store->word2 != 3) {
-                print_load_store_instr(load_store->word2, tabs);
+                print_load_store_instr(fp, load_store->word2, tabs);
         }
 }
 
 static void
-print_texture_reg_select(uint8_t u, unsigned base)
+print_texture_reg_select(FILE *fp, uint8_t u, unsigned base)
 {
         midgard_tex_register_select sel;
         memcpy(&sel, &u, sizeof(u));
 
         if (!sel.full)
-                printf("h");
+                fprintf(fp, "h");
 
-        printf("r%u", base + sel.select);
+        fprintf(fp, "r%u", base + sel.select);
 
         unsigned component = sel.component;
 
@@ -1188,22 +1270,22 @@ print_texture_reg_select(uint8_t u, unsigned base)
                 component += 4;
         }
 
-        printf(".%c", components[component]);
+        fprintf(fp, ".%c", components[component]);
 
         assert(sel.zero == 0);
 }
 
 static void
-print_texture_format(int format)
+print_texture_format(FILE *fp, int format)
 {
         /* Act like a modifier */
-        printf(".");
+        fprintf(fp, ".");
 
         switch (format) {
-                DEFINE_CASE(MALI_TEX_1D, "1d");
-                DEFINE_CASE(MALI_TEX_2D, "2d");
-                DEFINE_CASE(MALI_TEX_3D, "3d");
-                DEFINE_CASE(MALI_TEX_CUBE, "cube");
+                DEFINE_CASE(1, "1d");
+                DEFINE_CASE(2, "2d");
+                DEFINE_CASE(3, "3d");
+                DEFINE_CASE(0, "cube");
 
         default:
                 unreachable("Bad format");
@@ -1211,15 +1293,11 @@ print_texture_format(int format)
 }
 
 static bool
-midgard_op_has_helpers(unsigned op, bool gather)
+midgard_op_has_helpers(unsigned op)
 {
-        if (gather)
-                return true;
-
         switch (op) {
         case TEXTURE_OP_NORMAL:
-        case TEXTURE_OP_DFDX:
-        case TEXTURE_OP_DFDY:
+        case TEXTURE_OP_DERIVATIVE:
                 return true;
         default:
                 return false;
@@ -1227,32 +1305,17 @@ midgard_op_has_helpers(unsigned op, bool gather)
 }
 
 static void
-print_texture_op(unsigned op, bool gather)
+print_texture_op(FILE *fp, unsigned op)
 {
-        /* Act like a bare name, like ESSL functions */
-
-        if (gather) {
-                printf("textureGather");
-
-                unsigned component = op >> 4;
-                unsigned bottom = op & 0xF;
-
-                if (bottom != 0x2)
-                        printf("_unk%u", bottom);
-
-                printf(".%c", components[component]);
-                return;
-        }
-
         switch (op) {
                 DEFINE_CASE(TEXTURE_OP_NORMAL, "texture");
                 DEFINE_CASE(TEXTURE_OP_LOD, "textureLod");
                 DEFINE_CASE(TEXTURE_OP_TEXEL_FETCH, "texelFetch");
-                DEFINE_CASE(TEXTURE_OP_DFDX, "dFdx");
-                DEFINE_CASE(TEXTURE_OP_DFDY, "dFdy");
+                DEFINE_CASE(TEXTURE_OP_BARRIER, "barrier");
+                DEFINE_CASE(TEXTURE_OP_DERIVATIVE, "derivative");
 
         default:
-                printf("tex_%X", op);
+                fprintf(fp, "tex_%X", op);
                 break;
         }
 }
@@ -1279,80 +1342,152 @@ sampler_type_name(enum mali_sampler_type t)
 
 }
 
+static void
+print_texture_barrier(FILE *fp, uint32_t *word)
+{
+        midgard_texture_barrier_word *barrier = (midgard_texture_barrier_word *) word;
+        /* Print the barrier's modifiers to fp, annotating any unexpected field */
+        if (barrier->type != TAG_TEXTURE_4_BARRIER)
+                fprintf(fp, "/* barrier tag %X != tex/bar */ ", barrier->type);
+
+        if (!barrier->cont)
+                fprintf(fp, "/* cont missing? */");
+
+        if (!barrier->last)
+                fprintf(fp, "/* last missing? */");
+        /* Each zeroN field is reported under its own name when it is non-zero */
+        if (barrier->zero1)
+                fprintf(fp, "/* zero1 = 0x%X */ ", barrier->zero1);
+
+        if (barrier->zero2)
+                fprintf(fp, "/* zero2 = 0x%X */ ", barrier->zero2);
+
+        if (barrier->zero3)
+                fprintf(fp, "/* zero3 = 0x%X */ ", barrier->zero3);
+
+        if (barrier->zero4)
+                fprintf(fp, "/* zero4 = 0x%X */ ", barrier->zero4);
+
+        if (barrier->zero5)
+                fprintf(fp, "/* zero5 = 0x%" PRIx64 " */ ", barrier->zero5);
+
+        if (barrier->out_of_order)
+                fprintf(fp, ".ooo%u", barrier->out_of_order);
+
+        fprintf(fp, "\n");
+}
+
 #undef DEFINE_CASE
 
+static const char *
+texture_mode(enum mali_texture_mode mode)
+{
+        /* Modifier suffix printed for a texture mode; "unk" if unrecognised */
+        if (mode == TEXTURE_NORMAL) return "";
+        if (mode == TEXTURE_SHADOW) return ".shadow";
+        if (mode == TEXTURE_GATHER_SHADOW) return ".gather.shadow";
+        if (mode == TEXTURE_GATHER_X) return ".gatherX";
+        if (mode == TEXTURE_GATHER_Y) return ".gatherY";
+        if (mode == TEXTURE_GATHER_Z) return ".gatherZ";
+        if (mode == TEXTURE_GATHER_W) return ".gatherW";
+
+        return "unk";
+}
+
+static const char *
+derivative_mode(enum mali_derivative_mode mode)
+{
+        /* Suffix printed for a derivative mode; "unk" if unrecognised */
+        if (mode == TEXTURE_DFDX)
+                return ".x";
+
+        return (mode == TEXTURE_DFDY) ? ".y" : "unk";
+}
+
 static void
-print_texture_word(uint32_t *word, unsigned tabs, unsigned in_reg_base, unsigned out_reg_base)
+print_texture_word(FILE *fp, uint32_t *word, unsigned tabs, unsigned in_reg_base, unsigned out_reg_base)
 {
         midgard_texture_word *texture = (midgard_texture_word *) word;
-
-        midg_stats.helper_invocations |=
-                midgard_op_has_helpers(texture->op, texture->is_gather);
+        midg_stats.helper_invocations |= midgard_op_has_helpers(texture->op);
 
         /* Broad category of texture operation in question */
-        print_texture_op(texture->op, texture->is_gather);
+        print_texture_op(fp, texture->op);
+
+        /* Barriers use a dramatically different code path */
+        if (texture->op == TEXTURE_OP_BARRIER) {
+                print_texture_barrier(fp, word);
+                return;
+        } else if (texture->type == TAG_TEXTURE_4_BARRIER)
+                fprintf (fp, "/* nonbarrier had tex/bar tag */ ");
+        else if (texture->type == TAG_TEXTURE_4_VTX)
+                fprintf (fp, ".vtx");
+
+        if (texture->op == TEXTURE_OP_DERIVATIVE)
+                fprintf(fp, "%s", derivative_mode(texture->mode));
+        else
+                fprintf(fp, "%s", texture_mode(texture->mode));
 
         /* Specific format in question */
-        print_texture_format(texture->format);
+        print_texture_format(fp, texture->format);
 
         /* Instruction "modifiers" parallel the ALU instructions. */
 
-        if (texture->shadow)
-                printf(".shadow");
-
         if (texture->cont)
-                printf(".cont");
+                fprintf(fp, ".cont");
 
         if (texture->last)
-                printf(".last");
+                fprintf(fp, ".last");
+
+        if (texture->out_of_order)
+                fprintf(fp, ".ooo%u", texture->out_of_order);
 
         /* Output modifiers are always interpreted floatly */
-        print_outmod(texture->outmod, false);
+        print_outmod(fp, texture->outmod, false);
 
-        printf(" %sr%d", texture->out_full ? "" : "h",
+        fprintf(fp, " %sr%u", texture->out_full ? "" : "h",
                         out_reg_base + texture->out_reg_select);
-        print_mask_4(texture->mask, texture->out_upper);
+        print_mask_4(fp, texture->mask, texture->out_upper);
         assert(!(texture->out_full && texture->out_upper));
-        printf(", ");
+        fprintf(fp, ", ");
 
         /* Depending on whether we read from textures directly or indirectly,
          * we may be able to update our analysis */
 
         if (texture->texture_register) {
-                printf("texture[");
-                print_texture_reg_select(texture->texture_handle, in_reg_base);
-                printf("], ");
+                fprintf(fp, "texture[");
+                print_texture_reg_select(fp, texture->texture_handle, in_reg_base);
+                fprintf(fp, "], ");
 
                 /* Indirect, tut tut */
                 midg_stats.texture_count = -16;
         } else {
-                printf("texture%u, ", texture->texture_handle);
+                fprintf(fp, "texture%u, ", texture->texture_handle);
                 update_stats(&midg_stats.texture_count, texture->texture_handle);
         }
 
         /* Print the type, GL style */
-        printf("%csampler", sampler_type_name(texture->sampler_type));
+        fprintf(fp, "%csampler", sampler_type_name(texture->sampler_type));
 
         if (texture->sampler_register) {
-                printf("[");
-                print_texture_reg_select(texture->sampler_handle, in_reg_base);
-                printf("]");
+                fprintf(fp, "[");
+                print_texture_reg_select(fp, texture->sampler_handle, in_reg_base);
+                fprintf(fp, "]");
 
                 midg_stats.sampler_count = -16;
         } else {
-                printf("%u", texture->sampler_handle);
+                fprintf(fp, "%u", texture->sampler_handle);
                 update_stats(&midg_stats.sampler_count, texture->sampler_handle);
         }
 
-        print_swizzle_vec4(texture->swizzle, false, false);
-        printf(", %sr%d", texture->in_reg_full ? "" : "h", in_reg_base + texture->in_reg_select);
+        print_swizzle_vec4(fp, texture->swizzle, false, false, false);
+        fprintf(fp, ", %sr%u", texture->in_reg_full ? "" : "h", in_reg_base + texture->in_reg_select);
         assert(!(texture->in_reg_full && texture->in_reg_upper));
 
         /* TODO: integrate with swizzle */
         if (texture->in_reg_upper)
-                printf("'");
+                fprintf(fp, "'");
 
-        print_swizzle_vec4(texture->in_reg_swizzle, false, false);
+        print_swizzle_vec4(fp, texture->in_reg_swizzle, false, false, false);
 
         /* There is *always* an offset attached. Of
          * course, that offset is just immediate #0 for a
@@ -1365,76 +1500,62 @@ print_texture_word(uint32_t *word, unsigned tabs, unsigned in_reg_base, unsigned
          * fields become register triplets */
 
         if (texture->offset_register) {
-                printf(" + ");
+                fprintf(fp, " + ");
 
-                bool full = texture->offset_x & 1;
-                bool select = texture->offset_x & 2;
-                bool upper = texture->offset_x & 4;
+                bool full = texture->offset & 1;
+                bool select = texture->offset & 2;
+                bool upper = texture->offset & 4;
 
-                printf("%sr%d", full ? "" : "h", in_reg_base + select);
+                fprintf(fp, "%sr%u", full ? "" : "h", in_reg_base + select);
                 assert(!(texture->out_full && texture->out_upper));
 
                 /* TODO: integrate with swizzle */
                 if (upper)
-                        printf("'");
+                        fprintf(fp, "'");
 
-                /* The less questions you ask, the better. */
+                print_swizzle_vec4(fp, texture->offset >> 3, false, false, false);
 
-                unsigned swizzle_lo, swizzle_hi;
-                unsigned orig_y = texture->offset_y;
-                unsigned orig_z = texture->offset_z;
-
-                memcpy(&swizzle_lo, &orig_y, sizeof(unsigned));
-                memcpy(&swizzle_hi, &orig_z, sizeof(unsigned));
-
-                /* Duplicate hi swizzle over */
-                assert(swizzle_hi < 4);
-                swizzle_hi = (swizzle_hi << 2) | swizzle_hi;
-
-                unsigned swiz = (swizzle_lo << 4) | swizzle_hi;
-                unsigned reversed = util_bitreverse(swiz) >> 24;
-                print_swizzle_vec4(reversed, false, false);
-
-                printf(", ");
-        } else if (texture->offset_x || texture->offset_y || texture->offset_z) {
+                fprintf(fp, ", ");
+        } else if (texture->offset) {
                 /* Only select ops allow negative immediate offsets, verify */
 
-                bool neg_x = texture->offset_x < 0;
-                bool neg_y = texture->offset_y < 0;
-                bool neg_z = texture->offset_z < 0;
+                signed offset_x = (signed) ((texture->offset & 0xF) ^ 0x8) - 0x8; /* sign-extend each nibble so the negative checks below can fire */
+                signed offset_y = (signed) (((texture->offset >> 4) & 0xF) ^ 0x8) - 0x8; /* NOTE(review): assumes 4-bit two's-complement components, as the */
+                signed offset_z = (signed) (((texture->offset >> 8) & 0xF) ^ 0x8) - 0x8; /* replaced signed offset_x/y/z code implies -- confirm in midgard.h */
+
+                bool neg_x = offset_x < 0;
+                bool neg_y = offset_y < 0;
+                bool neg_z = offset_z < 0;
                 bool any_neg = neg_x || neg_y || neg_z;
 
                 if (any_neg && texture->op != TEXTURE_OP_TEXEL_FETCH)
-                        printf("/* invalid negative */ ");
+                        fprintf(fp, "/* invalid negative */ ");
 
                 /* Regardless, just print the immediate offset */
 
-                printf(" + <%d, %d, %d>, ",
-                       texture->offset_x,
-                       texture->offset_y,
-                       texture->offset_z);
+                fprintf(fp, " + <%d, %d, %d>, ", offset_x, offset_y, offset_z);
         } else {
-                printf(", ");
+                fprintf(fp, ", ");
         }
 
         char lod_operand = texture_op_takes_bias(texture->op) ? '+' : '=';
 
         if (texture->lod_register) {
-                printf("lod %c ", lod_operand);
-                print_texture_reg_select(texture->bias, in_reg_base);
-                printf(", ");
+                fprintf(fp, "lod %c ", lod_operand);
+                print_texture_reg_select(fp, texture->bias, in_reg_base);
+                fprintf(fp, ", ");
 
                 if (texture->bias_int)
-                        printf(" /* bias_int = 0x%X */", texture->bias_int);
+                        fprintf(fp, " /* bias_int = 0x%X */", texture->bias_int);
         } else if (texture->op == TEXTURE_OP_TEXEL_FETCH) {
                 /* For texel fetch, the int LOD is in the fractional place and
-                 * there is no fraction / possibility of bias. We *always* have
-                 * an explicit LOD, even if it's zero. */
+                 * there is no fraction. We *always* have an explicit LOD, even
+                 * if it's zero. */
 
                 if (texture->bias_int)
-                        printf(" /* bias_int = 0x%X */ ", texture->bias_int);
+                        fprintf(fp, " /* bias_int = 0x%X */ ", texture->bias_int);
 
-                printf("lod = %u, ", texture->bias);
+                fprintf(fp, "lod = %u, ", texture->bias);
         } else if (texture->bias || texture->bias_int) {
                 signed bias_int = texture->bias_int;
                 float bias_frac = texture->bias / 256.0f;
@@ -1444,33 +1565,31 @@ print_texture_word(uint32_t *word, unsigned tabs, unsigned in_reg_base, unsigned
                 char sign = (bias >= 0.0) ? '+' : '-';
                 char operand = is_bias ? sign : '=';
 
-                printf("lod %c %f, ", operand, fabsf(bias));
+                fprintf(fp, "lod %c %f, ", operand, fabsf(bias));
         }
 
-        printf("\n");
+        fprintf(fp, "\n");
 
         /* While not zero in general, for these simple instructions the
          * following unknowns are zero, so we don't include them */
 
         if (texture->unknown4 ||
-            texture->unknownA ||
             texture->unknown8) {
-                printf("// unknown4 = 0x%x\n", texture->unknown4);
-                printf("// unknownA = 0x%x\n", texture->unknownA);
-                printf("// unknown8 = 0x%x\n", texture->unknown8);
+                fprintf(fp, "// unknown4 = 0x%x\n", texture->unknown4);
+                fprintf(fp, "// unknown8 = 0x%x\n", texture->unknown8);
         }
 
         midg_stats.instruction_count++;
 }
 
 struct midgard_disasm_stats
-disassemble_midgard(uint8_t *code, size_t size, unsigned gpu_id, gl_shader_stage stage)
+disassemble_midgard(FILE *fp, uint8_t *code, size_t size, unsigned gpu_id, gl_shader_stage stage)
 {
         uint32_t *words = (uint32_t *) code;
         unsigned num_words = size / 4;
         int tabs = 0;
 
-        bool prefetch_flag = false;
+        bool branch_forward = false;
 
         int last_next_tag = -1;
 
@@ -1485,87 +1604,124 @@ disassemble_midgard(uint8_t *code, size_t size, unsigned gpu_id, gl_shader_stage
         while (i < num_words) {
                 unsigned tag = words[i] & 0xF;
                 unsigned next_tag = (words[i] >> 4) & 0xF;
-                unsigned num_quad_words = midgard_word_size[tag];
+                unsigned num_quad_words = midgard_tag_props[tag].size;
 
                 if (midg_tags[i] && midg_tags[i] != tag) {
-                        printf("\t/* XXX: TAG ERROR branch, got ");
-                        print_tag_short(tag);
-                        printf(" expected ");
-                        print_tag_short(midg_tags[i]);
-                        printf(" */\n");
+                        fprintf(fp, "\t/* XXX: TAG ERROR branch, got %s expected %s */\n",
+                                        midgard_tag_props[tag].name,
+                                        midgard_tag_props[midg_tags[i]].name);
                 }
 
                 midg_tags[i] = tag;
 
-                /* Check the tag */
-                if (last_next_tag > 1) {
-                        if (last_next_tag != tag) {
-                                printf("\t/* XXX: TAG ERROR sequence, got ");
-                                print_tag_short(tag);
-                                printf(" expected ");
-                                print_tag_short(last_next_tag);
-                                printf(" */\n");
-                        }
-                } else {
-                        /* TODO: Check ALU case */
+                /* Check the tag. The idea is to ensure that next_tag is
+                 * *always* recoverable from the disassembly, such that we may
+                 * safely omit printing next_tag. To show this, we first
+                 * consider that next tags are semantically off-by-one -- we end
+                 * up parsing tag n during step n+1. So, we ensure after we're
+                 * done disassembling the next tag of the final bundle is BREAK
+                 * and warn otherwise. We also ensure that the next tag is
+                 * never INVALID. Beyond that, since the last tag is checked
+                 * outside the loop, we can check one tag prior. If equal to
+                 * the current tag (which is unique), we're done. Otherwise, we
+                 * print if that tag was > TAG_BREAK, which implies the tag was
+                 * not TAG_BREAK or TAG_INVALID. But we already checked for
+                 * TAG_INVALID, so it's just if the last tag was TAG_BREAK that
+                 * we're silent. So we throw in a print for break-next at
+                 * the end of the bundle (if it's not the final bundle, which
+                 * we already check for above), disambiguating this case as
+                 * well.  Hence in all cases we are unambiguous, QED. */
+
+                if (next_tag == TAG_INVALID)
+                        fprintf(fp, "\t/* XXX: invalid next tag */\n");
+
+                if (last_next_tag > TAG_BREAK && last_next_tag != tag) {
+                        fprintf(fp, "\t/* XXX: TAG ERROR sequence, got %s expexted %s */\n",
+                                        midgard_tag_props[tag].name,
+                                        midgard_tag_props[last_next_tag].name);
                 }
 
                 last_next_tag = next_tag;
 
-                switch (midgard_word_types[tag]) {
-                case midgard_word_type_texture: {
-                        /* Texturing uses ldst/work space on T720 */
-                        bool has_texture_pipeline = gpu_id != 0x0720;
-                        print_texture_word(&words[i], tabs,
-                                        has_texture_pipeline ? REG_TEX_BASE : 0,
-                                        has_texture_pipeline ? REG_TEX_BASE : REGISTER_LDST_BASE);
+                /* Tags are unique in the following way:
+                 *
+                 * INVALID, BREAK, UNKNOWN_*: verbosely printed
+                 * TEXTURE_4_BARRIER: verified by barrier/!barrier op
+                 * TEXTURE_4_VTX: .vtx tag printed
+                 * TEXTURE_4: texture lack of barriers or .vtx
+                 * TAG_LOAD_STORE_4: only load/store
+                 * TAG_ALU_4/8/12/16: by number of instructions/constants
+                 * TAG_ALU_4_8/12/16_WRITEOUT: ^^ with .writeout tag
+                 */
+
+                switch (tag) {
+                case TAG_TEXTURE_4_VTX ... TAG_TEXTURE_4_BARRIER: {
+                        bool interpipe_aliasing =
+                                midgard_get_quirks(gpu_id) & MIDGARD_INTERPIPE_REG_ALIASING;
+
+                        print_texture_word(fp, &words[i], tabs,
+                                        interpipe_aliasing ? 0 : REG_TEX_BASE,
+                                        interpipe_aliasing ? REGISTER_LDST_BASE : REG_TEX_BASE);
                         break;
                 }
 
-                case midgard_word_type_load_store:
-                        print_load_store_word(&words[i], tabs);
+                case TAG_LOAD_STORE_4:
+                        print_load_store_word(fp, &words[i], tabs);
                         break;
 
-                case midgard_word_type_alu:
-                        print_alu_word(&words[i], num_quad_words, tabs, i + 4*num_quad_words);
+                case TAG_ALU_4 ... TAG_ALU_16_WRITEOUT:
+                        branch_forward = print_alu_word(fp, &words[i], num_quad_words, tabs, i + 4*num_quad_words);
 
                         /* Reset word static analysis state */
                         is_embedded_constant_half = false;
                         is_embedded_constant_int = false;
 
+                        /* TODO: infer/verify me */
+                        if (tag >= TAG_ALU_4_WRITEOUT)
+                                fprintf(fp, "writeout\n");
+
                         break;
 
                 default:
-                        printf("Unknown word type %u:\n", words[i] & 0xF);
+                        fprintf(fp, "Unknown word type %u:\n", words[i] & 0xF);
                         num_quad_words = 1;
-                        print_quad_word(&words[i], tabs);
-                        printf("\n");
+                        print_quad_word(fp, &words[i], tabs);
+                        fprintf(fp, "\n");
                         break;
                 }
 
-                if (prefetch_flag && midgard_word_types[tag] == midgard_word_type_alu)
-                        break;
-
-                printf("\n");
-
-                unsigned next = (words[i] & 0xF0) >> 4;
+                /* We are parsing per bundle anyway. Add before we start
+                 * breaking out so we don't miss the final bundle. */
 
-                /* We are parsing per bundle anyway */
                 midg_stats.bundle_count++;
                 midg_stats.quadword_count += num_quad_words;
 
-                /* Break based on instruction prefetch flag */
-
-                if (i < num_words && next == 1) {
-                        prefetch_flag = true;
+                /* Include a synthetic "break" instruction at the end of the
+                 * bundle to signify that if, absent a branch, the shader
+                 * execution will stop here. Stop disassembly at such a break
+                 * based on a heuristic */
 
-                        if (midgard_word_types[words[i] & 0xF] != midgard_word_type_alu)
+                if (next_tag == TAG_BREAK) {
+                        if (branch_forward) {
+                                fprintf(fp, "break\n");
+                        } else {
+                                fprintf(fp, "\n");
                                 break;
+                        }
                 }
 
+                fprintf(fp, "\n");
+
                 i += 4 * num_quad_words;
         }
 
+        if (last_next_tag != TAG_BREAK) {
+                fprintf(fp, "/* XXX: shader ended with tag %s */\n",
+                                midgard_tag_props[last_next_tag].name);
+        }
+
+        free(midg_tags);
+
         /* We computed work_count as max_work_registers, so add one to get the
          * count. If no work registers are written, you still have one work
          * reported, which is exactly what the hardware expects */