freedreno: small fix for flushing dependent batches
[mesa.git] / src / gallium / drivers / freedreno / ir3 / ir3.c
index 41112460155619e040d618ca4a88d29778f1dabb..dd5fb2fbbe5ae483d5cb69f7601233883cf80d03 100644 (file)
 #include <stdbool.h>
 #include <errno.h>
 
+#include "util/ralloc.h"
+
 #include "freedreno_util.h"
 #include "instr-a3xx.h"
 
-#define CHUNK_SZ 1020
-
-struct ir3_heap_chunk {
-       struct ir3_heap_chunk *next;
-       uint32_t heap[CHUNK_SZ];
-};
-
-static void grow_heap(struct ir3 *shader)
-{
-       struct ir3_heap_chunk *chunk = calloc(1, sizeof(*chunk));
-       chunk->next = shader->chunk;
-       shader->chunk = chunk;
-       shader->heap_idx = 0;
-}
-
 /* simple allocator to carve allocations out of an up-front allocated heap,
  * so that we can free everything easily in one shot.
  */
 void * ir3_alloc(struct ir3 *shader, int sz)
 {
-       void *ptr;
+       return rzalloc_size(shader, sz); /* sz zeroed bytes, with shader passed as the ralloc context; TODO: don't use rzalloc */
+}
 
-       sz = align(sz, 4) / 4;
+struct ir3 * ir3_create(struct ir3_compiler *compiler, /* create an empty shader sized for nin inputs and nout outputs */
+               unsigned nin, unsigned nout)
+{
+       struct ir3 *shader = rzalloc(compiler, struct ir3); /* the shader itself is allocated against the compiler */
 
-       if ((shader->heap_idx + sz) > CHUNK_SZ)
-               grow_heap(shader);
+       shader->compiler = compiler;
+       shader->ninputs = nin;
+       shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin); /* nin-entry array, allocated via ir3_alloc() */
 
-       ptr = &shader->chunk->heap[shader->heap_idx];
-       shader->heap_idx += sz;
+       shader->noutputs = nout;
+       shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout); /* nout-entry array, same allocator */
 
-       return ptr;
-}
+       list_inithead(&shader->block_list); /* both lists start out empty */
+       list_inithead(&shader->array_list);
 
-struct ir3 * ir3_create(void)
-{
-       struct ir3 *shader =
-                       calloc(1, sizeof(struct ir3));
-       grow_heap(shader);
         return shader;
 }
 
 void ir3_destroy(struct ir3 *shader)
 {
-       while (shader->chunk) {
-               struct ir3_heap_chunk *chunk = shader->chunk;
-               shader->chunk = chunk->next;
-               free(chunk);
-       }
-       free(shader->instrs);
-       free(shader->baryfs);
-       free(shader);
+       ralloc_free(shader); /* single call replaces the chunk walk and the separate free() calls; presumably also releases everything allocated with shader as ralloc context -- confirm against ralloc.h */
 }
 
 #define iassert(cond) do { \
@@ -97,7 +77,10 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
 {
        reg_t val = { .dummy32 = 0 };
 
-       assert(!(reg->flags & ~valid_flags));
+       if (reg->flags & ~valid_flags) {
+               debug_printf("INVALID FLAGS: %x vs %x\n",
+                               reg->flags, valid_flags);
+       }
 
        if (!(reg->flags & IR3_REG_R))
                repeat = 0;
@@ -105,15 +88,25 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
        if (reg->flags & IR3_REG_IMMED) {
                val.iim_val = reg->iim_val;
        } else {
-               int8_t components = util_last_bit(reg->wrmask);
-               int16_t max = (reg->num + repeat + components - 1) >> 2;
+               unsigned components;
+               int16_t max;
 
-               val.comp = reg->num & 0x3;
-               val.num  = reg->num >> 2;
+               if (reg->flags & IR3_REG_RELATIV) {
+                       components = reg->size;
+                       val.idummy10 = reg->array.offset;
+                       max = (reg->array.offset + repeat + components - 1) >> 2;
+               } else {
+                       components = util_last_bit(reg->wrmask);
+                       val.comp = reg->num & 0x3;
+                       val.num  = reg->num >> 2;
+                       max = (reg->num + repeat + components - 1) >> 2;
+               }
 
                if (reg->flags & IR3_REG_CONST) {
                        info->max_const = MAX2(info->max_const, max);
-               } else if ((max != REG_A0) && (max != REG_P0)) {
+               } else if (val.num == 63) {
+                       /* ignore writes to dummy register r63.x */
+               } else if (max < 48) {
                        if (reg->flags & IR3_REG_HALF) {
                                info->max_half_reg = MAX2(info->max_half_reg, max);
                        } else {
@@ -130,7 +123,13 @@ static int emit_cat0(struct ir3_instruction *instr, void *ptr,
 {
        instr_cat0_t *cat0 = ptr;
 
-       cat0->immed    = instr->cat0.immed;
+       if (info->gpu_id >= 500) {
+               cat0->a5xx.immed = instr->cat0.immed;
+       } else if (info->gpu_id >= 400) {
+               cat0->a4xx.immed = instr->cat0.immed;
+       } else {
+               cat0->a3xx.immed = instr->cat0.immed;
+       }
        cat0->repeat   = instr->repeat;
        cat0->ss       = !!(instr->flags & IR3_INSTR_SS);
        cat0->inv      = instr->cat0.inv;
@@ -164,13 +163,13 @@ static int emit_cat1(struct ir3_instruction *instr, void *ptr,
                cat1->iim_val = src->iim_val;
                cat1->src_im  = 1;
        } else if (src->flags & IR3_REG_RELATIV) {
-               cat1->off       = src->offset;
+               cat1->off       = reg(src, info, instr->repeat,
+                               IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF | IR3_REG_RELATIV);
                cat1->src_rel   = 1;
                cat1->src_rel_c = !!(src->flags & IR3_REG_CONST);
        } else {
                cat1->src  = reg(src, info, instr->repeat,
-                               IR3_REG_IMMED | IR3_REG_R |
-                               IR3_REG_CONST | IR3_REG_HALF);
+                               IR3_REG_R | IR3_REG_CONST | IR3_REG_HALF);
                cat1->src_c     = !!(src->flags & IR3_REG_CONST);
        }
 
@@ -200,31 +199,31 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
        struct ir3_register *src1 = instr->regs[1];
        struct ir3_register *src2 = instr->regs[2];
        instr_cat2_t *cat2 = ptr;
+       unsigned absneg = ir3_cat2_absneg(instr->opc);
 
        iassert((instr->regs_count == 2) || (instr->regs_count == 3));
 
        if (src1->flags & IR3_REG_RELATIV) {
-               iassert(src1->num < (1 << 10));
+               iassert(src1->array.offset < (1 << 10));
                cat2->rel1.src1      = reg(src1, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                               IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+                               IR3_REG_HALF | absneg);
                cat2->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
                cat2->rel1.src1_rel  = 1;
        } else if (src1->flags & IR3_REG_CONST) {
                iassert(src1->num < (1 << 12));
                cat2->c1.src1   = reg(src1, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
-                               IR3_REG_R | IR3_REG_HALF);
+                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
                cat2->c1.src1_c = 1;
        } else {
                iassert(src1->num < (1 << 11));
                cat2->src1 = reg(src1, info, instr->repeat,
-                               IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
-                               IR3_REG_R | IR3_REG_HALF);
+                               IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
+                               absneg);
        }
        cat2->src1_im  = !!(src1->flags & IR3_REG_IMMED);
-       cat2->src1_neg = !!(src1->flags & IR3_REG_NEGATE);
-       cat2->src1_abs = !!(src1->flags & IR3_REG_ABS);
+       cat2->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+       cat2->src1_abs = !!(src1->flags & (IR3_REG_FABS | IR3_REG_SABS));
        cat2->src1_r   = !!(src1->flags & IR3_REG_R);
 
        if (src2) {
@@ -232,34 +231,34 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
                                !((src1->flags ^ src2->flags) & IR3_REG_HALF));
 
                if (src2->flags & IR3_REG_RELATIV) {
-                       iassert(src2->num < (1 << 10));
+                       iassert(src2->array.offset < (1 << 10));
                        cat2->rel2.src2      = reg(src2, info, instr->repeat,
-                                       IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                                       IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+                                       IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+                                       IR3_REG_HALF | absneg);
                        cat2->rel2.src2_c    = !!(src2->flags & IR3_REG_CONST);
                        cat2->rel2.src2_rel  = 1;
                } else if (src2->flags & IR3_REG_CONST) {
                        iassert(src2->num < (1 << 12));
                        cat2->c2.src2   = reg(src2, info, instr->repeat,
-                                       IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
-                                       IR3_REG_R | IR3_REG_HALF);
+                                       IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
                        cat2->c2.src2_c = 1;
                } else {
                        iassert(src2->num < (1 << 11));
                        cat2->src2 = reg(src2, info, instr->repeat,
-                                       IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
-                                       IR3_REG_R | IR3_REG_HALF);
+                                       IR3_REG_IMMED | IR3_REG_R | IR3_REG_HALF |
+                                       absneg);
                }
 
                cat2->src2_im  = !!(src2->flags & IR3_REG_IMMED);
-               cat2->src2_neg = !!(src2->flags & IR3_REG_NEGATE);
-               cat2->src2_abs = !!(src2->flags & IR3_REG_ABS);
+               cat2->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
+               cat2->src2_abs = !!(src2->flags & (IR3_REG_FABS | IR3_REG_SABS));
                cat2->src2_r   = !!(src2->flags & IR3_REG_R);
        }
 
        cat2->dst      = reg(dst, info, instr->repeat,
                        IR3_REG_R | IR3_REG_EI | IR3_REG_HALF);
        cat2->repeat   = instr->repeat;
+       cat2->sat      = !!(instr->flags & IR3_INSTR_SAT);
        cat2->ss       = !!(instr->flags & IR3_INSTR_SS);
        cat2->ul       = !!(instr->flags & IR3_INSTR_UL);
        cat2->dst_half = !!((src1->flags ^ dst->flags) & IR3_REG_HALF);
@@ -281,6 +280,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
        struct ir3_register *src1 = instr->regs[1];
        struct ir3_register *src2 = instr->regs[2];
        struct ir3_register *src3 = instr->regs[3];
+       unsigned absneg = ir3_cat3_absneg(instr->opc);
        instr_cat3_t *cat3 = ptr;
        uint32_t src_flags = 0;
 
@@ -305,59 +305,57 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
        iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
 
        if (src1->flags & IR3_REG_RELATIV) {
-               iassert(src1->num < (1 << 10));
+               iassert(src1->array.offset < (1 << 10));
                cat3->rel1.src1      = reg(src1, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                               IR3_REG_R | IR3_REG_HALF);
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+                               IR3_REG_HALF | absneg);
                cat3->rel1.src1_c    = !!(src1->flags & IR3_REG_CONST);
                cat3->rel1.src1_rel  = 1;
        } else if (src1->flags & IR3_REG_CONST) {
                iassert(src1->num < (1 << 12));
                cat3->c1.src1   = reg(src1, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R |
-                               IR3_REG_HALF);
+                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
                cat3->c1.src1_c = 1;
        } else {
                iassert(src1->num < (1 << 11));
                cat3->src1 = reg(src1, info, instr->repeat,
-                               IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF);
+                               IR3_REG_R | IR3_REG_HALF | absneg);
        }
 
-       cat3->src1_neg = !!(src1->flags & IR3_REG_NEGATE);
+       cat3->src1_neg = !!(src1->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
        cat3->src1_r   = !!(src1->flags & IR3_REG_R);
 
        cat3->src2     = reg(src2, info, instr->repeat,
-                       IR3_REG_CONST | IR3_REG_NEGATE |
-                       IR3_REG_R | IR3_REG_HALF);
+                       IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF | absneg);
        cat3->src2_c   = !!(src2->flags & IR3_REG_CONST);
-       cat3->src2_neg = !!(src2->flags & IR3_REG_NEGATE);
+       cat3->src2_neg = !!(src2->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
        cat3->src2_r   = !!(src2->flags & IR3_REG_R);
 
 
        if (src3->flags & IR3_REG_RELATIV) {
-               iassert(src3->num < (1 << 10));
+               iassert(src3->array.offset < (1 << 10));
                cat3->rel2.src3      = reg(src3, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                               IR3_REG_R | IR3_REG_HALF);
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
+                               IR3_REG_HALF | absneg);
                cat3->rel2.src3_c    = !!(src3->flags & IR3_REG_CONST);
                cat3->rel2.src3_rel  = 1;
        } else if (src3->flags & IR3_REG_CONST) {
                iassert(src3->num < (1 << 12));
                cat3->c2.src3   = reg(src3, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_R |
-                               IR3_REG_HALF);
+                               IR3_REG_CONST | IR3_REG_R | IR3_REG_HALF);
                cat3->c2.src3_c = 1;
        } else {
                iassert(src3->num < (1 << 11));
                cat3->src3 = reg(src3, info, instr->repeat,
-                               IR3_REG_NEGATE | IR3_REG_R | IR3_REG_HALF);
+                               IR3_REG_R | IR3_REG_HALF | absneg);
        }
 
-       cat3->src3_neg = !!(src3->flags & IR3_REG_NEGATE);
+       cat3->src3_neg = !!(src3->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT));
        cat3->src3_r   = !!(src3->flags & IR3_REG_R);
 
        cat3->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
        cat3->repeat   = instr->repeat;
+       cat3->sat      = !!(instr->flags & IR3_INSTR_SAT);
        cat3->ss       = !!(instr->flags & IR3_INSTR_SS);
        cat3->ul       = !!(instr->flags & IR3_INSTR_UL);
        cat3->dst_half = !!((src_flags ^ dst->flags) & IR3_REG_HALF);
@@ -379,32 +377,33 @@ static int emit_cat4(struct ir3_instruction *instr, void *ptr,
        iassert(instr->regs_count == 2);
 
        if (src->flags & IR3_REG_RELATIV) {
-               iassert(src->num < (1 << 10));
+               iassert(src->array.offset < (1 << 10));
                cat4->rel.src      = reg(src, info, instr->repeat,
-                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_NEGATE |
-                               IR3_REG_ABS | IR3_REG_R | IR3_REG_HALF);
+                               IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
+                               IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
                cat4->rel.src_c    = !!(src->flags & IR3_REG_CONST);
                cat4->rel.src_rel  = 1;
        } else if (src->flags & IR3_REG_CONST) {
                iassert(src->num < (1 << 12));
                cat4->c.src   = reg(src, info, instr->repeat,
-                               IR3_REG_CONST | IR3_REG_NEGATE | IR3_REG_ABS |
+                               IR3_REG_CONST | IR3_REG_FNEG | IR3_REG_FABS |
                                IR3_REG_R | IR3_REG_HALF);
                cat4->c.src_c = 1;
        } else {
                iassert(src->num < (1 << 11));
                cat4->src = reg(src, info, instr->repeat,
-                               IR3_REG_IMMED | IR3_REG_NEGATE | IR3_REG_ABS |
+                               IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
                                IR3_REG_R | IR3_REG_HALF);
        }
 
        cat4->src_im   = !!(src->flags & IR3_REG_IMMED);
-       cat4->src_neg  = !!(src->flags & IR3_REG_NEGATE);
-       cat4->src_abs  = !!(src->flags & IR3_REG_ABS);
+       cat4->src_neg  = !!(src->flags & IR3_REG_FNEG);
+       cat4->src_abs  = !!(src->flags & IR3_REG_FABS);
        cat4->src_r    = !!(src->flags & IR3_REG_R);
 
        cat4->dst      = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
        cat4->repeat   = instr->repeat;
+       cat4->sat      = !!(instr->flags & IR3_INSTR_SAT);
        cat4->ss       = !!(instr->flags & IR3_INSTR_SS);
        cat4->ul       = !!(instr->flags & IR3_INSTR_UL);
        cat4->dst_half = !!((src->flags ^ dst->flags) & IR3_REG_HALF);
@@ -428,12 +427,14 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr,
 
        iassert(!((dst->flags ^ type_flags(instr->cat5.type)) & IR3_REG_HALF));
 
+       assume(src1 || !src2);
+       assume(src2 || !src3);
+
        if (src1) {
                cat5->full = ! (src1->flags & IR3_REG_HALF);
                cat5->src1 = reg(src1, info, instr->repeat, IR3_REG_HALF);
        }
 
-
        if (instr->flags & IR3_INSTR_S2EN) {
                if (src2) {
                        iassert(!((src1->flags ^ src2->flags) & IR3_REG_HALF));
@@ -474,63 +475,196 @@ static int emit_cat5(struct ir3_instruction *instr, void *ptr,
 static int emit_cat6(struct ir3_instruction *instr, void *ptr,
                 struct ir3_info *info)
 {
-       struct ir3_register *dst = instr->regs[0];
-       struct ir3_register *src = instr->regs[1];
+       struct ir3_register *dst, *src1, *src2; /* emit_cat6 encodes one category-6 instruction into ptr; these are picked out of instr->regs[] below */
         instr_cat6_t *cat6 = ptr;
 
-       iassert(instr->regs_count == 2);
+       cat6->type     = instr->cat6.type; /* fields shared by every cat6 form are written first, through the generic instr_cat6_t view */
+       cat6->opc      = instr->opc;
+       cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
+       cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
+       cat6->g        = !!(instr->flags & IR3_INSTR_G);
+       cat6->opc_cat  = 6;
 
-       switch (instr->opc) {
-       /* load instructions: */
-       case OPC_LDG:
-       case OPC_LDP:
-       case OPC_LDL:
-       case OPC_LDLW:
-       case OPC_LDLV:
-       case OPC_PREFETCH: {
-               instr_cat6a_t *cat6a = ptr;
+       /* the "dst" for a store instruction is (from the perspective
+        * of data flow in the shader, ie. register use/def, etc) in
+        * fact a register that is read by the instruction, rather
+        * than written:
+        */
+       if (is_store(instr)) {
+               iassert(instr->regs_count >= 3);
 
-               iassert(!((dst->flags ^ type_flags(instr->cat6.type)) & IR3_REG_HALF));
+               dst  = instr->regs[1]; /* store: operands are taken starting at regs[1]; regs[0] is not read here */
+               src1 = instr->regs[2];
+               src2 = (instr->regs_count >= 4) ? instr->regs[3] : NULL; /* optional: NULL when there is no fourth register */
+       } else {
+               iassert(instr->regs_count >= 2);
 
-               cat6a->must_be_one1  = 1;
-               cat6a->must_be_one2  = 1;
-               cat6a->off = instr->cat6.offset;
-               cat6a->src = reg(src, info, instr->repeat, 0);
-               cat6a->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
-               break;
+               dst  = instr->regs[0]; /* non-store: regs[0] is the destination */
+               src1 = instr->regs[1];
+               src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL; /* optional: NULL when there is no third register */
         }
-       /* store instructions: */
-       case OPC_STG:
-       case OPC_STP:
-       case OPC_STL:
-       case OPC_STLW:
-       case OPC_STI: {
-               instr_cat6b_t *cat6b = ptr;
-               uint32_t src_flags = type_flags(instr->cat6.type);
-               uint32_t dst_flags = (instr->opc == OPC_STI) ? IR3_REG_HALF : 0;
 
-               iassert(!((src->flags ^ src_flags) & IR3_REG_HALF));
+       /* TODO we need a more comprehensive list about which instructions
+        * can be encoded which way.  Or possibly use IR3_INSTR_0 flag to
+        * indicate to use the src_off encoding even if offset is zero
+        * (but then what to do about dst_off?)
+        */
+       if (is_atomic(instr->opc)) { /* atomics are written through the instr_cat6ldgb_t view of ptr */
+               instr_cat6ldgb_t *ldgb = ptr;
 
-               cat6b->must_be_one1  = 1;
-               cat6b->must_be_one2  = 1;
-               cat6b->src    = reg(src, info, instr->repeat, src_flags);
-               cat6b->off_hi = instr->cat6.offset >> 8;
-               cat6b->off    = instr->cat6.offset;
-               cat6b->dst    = reg(dst, info, instr->repeat, IR3_REG_R | dst_flags);
+               /* maybe these two bits both determine the instruction encoding? */
+               cat6->src_off = false;
 
-               break;
+               ldgb->d = instr->cat6.d - 1; /* d and type_size are stored as (value - 1) */
+               ldgb->typed = instr->cat6.typed;
+               ldgb->type_size = instr->cat6.iim_val - 1;
+
+               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+               if (ldgb->g) { /* NOTE(review): g is read back through ldgb; presumably the same bit as cat6->g written above -- confirm the struct layouts */
+                       struct ir3_register *src3 = instr->regs[3]; /* NOTE(review): regs[3]/regs[4] are read with no visible regs_count check -- confirm callers always supply them */
+                       struct ir3_register *src4 = instr->regs[4];
+
+                       /* first src is src_ssbo: */
+                       iassert(src1->flags & IR3_REG_IMMED);
+                       ldgb->src_ssbo = src1->uim_val;
+
+                       ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED); /* encoded src1/src2/src3 come from regs shifted by one, since src1 held the ssbo index */
+                       ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+                       ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+                       ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+                       ldgb->src3 = reg(src4, info, instr->repeat, 0);
+                       ldgb->pad0 = 0x1;
+                       ldgb->pad3 = 0x1;
+               } else {
+                       ldgb->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED); /* g clear: src1/src2 are encoded directly, no ssbo index */
+                       ldgb->src1_im = !!(src1->flags & IR3_REG_IMMED);
+                       ldgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED); /* NOTE(review): src2 may be NULL per the assignment above -- confirm atomics always carry it */
+                       ldgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+                       ldgb->pad0 = 0x1;
+                       ldgb->pad3 = 0x0;
+               }
+
+               return 0; /* fully encoded; the generic src/dst handling at the bottom is skipped */
+       } else if (instr->opc == OPC_LDGB) {
+               struct ir3_register *src3 = instr->regs[3]; /* NOTE(review): read with no visible regs_count >= 4 check */
+               instr_cat6ldgb_t *ldgb = ptr;
+
+               /* maybe these two bits both determine the instruction encoding? */
+               cat6->src_off = false;
+
+               ldgb->d = instr->cat6.d - 1;
+               ldgb->typed = instr->cat6.typed;
+               ldgb->type_size = instr->cat6.iim_val - 1;
+
+               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+               /* first src is src_ssbo: */
+               iassert(src1->flags & IR3_REG_IMMED);
+               ldgb->src_ssbo = src1->uim_val;
+
+               /* then next two are src1/src2: */
+               ldgb->src1 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+               ldgb->src1_im = !!(src2->flags & IR3_REG_IMMED);
+               ldgb->src2 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+               ldgb->src2_im = !!(src3->flags & IR3_REG_IMMED);
+
+               ldgb->pad0 = 0x0; /* pad0/pad3 values differ from the atomic paths above (0x1/0x1 and 0x1/0x0) */
+               ldgb->pad3 = 0x1;
+
+               return 0;
+       } else if (instr->opc == OPC_RESINFO) { /* resinfo: only d, dst and the ssbo index are written */
+               instr_cat6ldgb_t *ldgb = ptr;
+
+               ldgb->d = instr->cat6.d - 1;
+
+               ldgb->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+
+               /* first src is src_ssbo: */
+               iassert(src1->flags & IR3_REG_IMMED);
+               ldgb->src_ssbo = src1->uim_val;
+
+               return 0;
+       } else if ((instr->opc == OPC_STGB) || (instr->opc == OPC_STIB)) {
+               struct ir3_register *src3 = instr->regs[4]; /* on the store path dst/src1/src2 are regs[1..3], so the third source is regs[4]; NOTE(review): no visible regs_count check */
+               instr_cat6stgb_t *stgb = ptr;
+
+               /* maybe these two bits both determine the instruction encoding? */
+               cat6->src_off = true;
+               stgb->pad3 = 0x2;
+
+               stgb->d = instr->cat6.d - 1;
+               stgb->typed = instr->cat6.typed;
+               stgb->type_size = instr->cat6.iim_val - 1;
+
+               /* first src is dst_ssbo: */
+               iassert(dst->flags & IR3_REG_IMMED);
+               stgb->dst_ssbo = dst->uim_val;
+
+               /* then src1/src2/src3: */
+               stgb->src1 = reg(src1, info, instr->repeat, 0); /* src1 is passed no valid flags; src2/src3 may be immediates */
+               stgb->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+               stgb->src2_im = !!(src2->flags & IR3_REG_IMMED);
+               stgb->src3 = reg(src3, info, instr->repeat, IR3_REG_IMMED);
+               stgb->src3_im = !!(src3->flags & IR3_REG_IMMED);
+
+               return 0;
+       } else if (instr->cat6.src_offset || (instr->opc == OPC_LDG) ||
+                       (instr->opc == OPC_LDL)) { /* ldg/ldl take the src-offset form even when the offset is zero */
+               instr_cat6a_t *cat6a = ptr;
+
+               cat6->src_off = true;
+
+               cat6a->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
+               cat6a->src1_im = !!(src1->flags & IR3_REG_IMMED);
+               if (src2) {
+                       cat6a->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+                       cat6a->src2_im = !!(src2->flags & IR3_REG_IMMED);
+               }
+               cat6a->off = instr->cat6.src_offset;
+       } else { /* no source offset: instr_cat6b_t form */
+               instr_cat6b_t *cat6b = ptr;
+
+               cat6->src_off = false;
+
+               cat6b->src1 = reg(src1, info, instr->repeat, IR3_REG_IMMED);
+               cat6b->src1_im = !!(src1->flags & IR3_REG_IMMED);
+               if (src2) {
+                       cat6b->src2 = reg(src2, info, instr->repeat, IR3_REG_IMMED);
+                       cat6b->src2_im = !!(src2->flags & IR3_REG_IMMED);
+               }
         }
-       default:
-               // TODO
-               break;
+
+       if (instr->cat6.dst_offset || (instr->opc == OPC_STG) ||
+                       (instr->opc == OPC_STL)) { /* only the cat6a/cat6b paths reach here; stg/stl take the dst-offset form even when the offset is zero */
+               instr_cat6c_t *cat6c = ptr;
+               cat6->dst_off = true;
+               cat6c->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
+               cat6c->off = instr->cat6.dst_offset;
+       } else {
+               instr_cat6d_t *cat6d = ptr;
+               cat6->dst_off = false;
+               cat6d->dst = reg(dst, info, instr->repeat, IR3_REG_R | IR3_REG_HALF);
         }
 
-       cat6->iim_val  = instr->cat6.iim_val;
-       cat6->type     = instr->cat6.type;
-       cat6->opc      = instr->opc;
-       cat6->jmp_tgt  = !!(instr->flags & IR3_INSTR_JP);
-       cat6->sync     = !!(instr->flags & IR3_INSTR_SY);
-       cat6->opc_cat  = 6;
+       return 0;
+}
+
+static int emit_cat7(struct ir3_instruction *instr, void *ptr, /* encode one category-7 instruction into ptr */
+               struct ir3_info *info) /* info is not read by this encoder */
+{
+       instr_cat7_t *cat7 = ptr;
+
+       cat7->ss      = !!(instr->flags & IR3_INSTR_SS); /* flag bits are normalized to 0/1 with !! */
+       cat7->w       = instr->cat7.w; /* w/r/l/g are copied unchanged from instr->cat7 */
+       cat7->r       = instr->cat7.r;
+       cat7->l       = instr->cat7.l;
+       cat7->g       = instr->cat7.g;
+       cat7->opc     = instr->opc;
+       cat7->jmp_tgt = !!(instr->flags & IR3_INSTR_JP);
+       cat7->sync    = !!(instr->flags & IR3_INSTR_SY);
+       cat7->opc_cat = 7; /* category field is the constant 7 */
 
         return 0;
 }
@@ -538,38 +672,54 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 static int (*emit[])(struct ir3_instruction *instr, void *ptr,
                 struct ir3_info *info) = {
        emit_cat0, emit_cat1, emit_cat2, emit_cat3, emit_cat4, emit_cat5, emit_cat6,
+       emit_cat7, /* slot 7; ir3_assemble() indexes this table with opc_cat(instr->opc) */
 };
 
 void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
                uint32_t gpu_id)
 {
        uint32_t *ptr, *dwords;
-       uint32_t i;
 
+       info->gpu_id        = gpu_id;
        info->max_reg       = -1;
        info->max_half_reg  = -1;
        info->max_const     = -1;
        info->instrs_count  = 0;
+       info->sizedwords    = 0;
+       info->ss = info->sy = 0;
+
+       list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       info->sizedwords += 2;
+               }
+       }
 
-       /* need a integer number of instruction "groups" (sets of 16
+       /* need an integer number of instruction "groups" (sets of 16
         * instructions on a4xx or sets of 4 instructions on a3xx),
         * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits)
         */
        if (gpu_id >= 400) {
-               info->sizedwords = 2 * align(shader->instrs_count, 16);
+               info->sizedwords = align(info->sizedwords, 16 * 2);
        } else {
-               info->sizedwords = 2 * align(shader->instrs_count, 4);
+               info->sizedwords = align(info->sizedwords, 4 * 2);
        }
 
        ptr = dwords = calloc(4, info->sizedwords);
 
-       for (i = 0; i < shader->instrs_count; i++) {
-               struct ir3_instruction *instr = shader->instrs[i];
-               int ret = emit[instr->category](instr, dwords, info);
-               if (ret)
-                       goto fail;
-               info->instrs_count += 1 + instr->repeat;
-               dwords += 2;
+       list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       int ret = emit[opc_cat(instr->opc)](instr, dwords, info);
+                       if (ret)
+                               goto fail;
+                       info->instrs_count += 1 + instr->repeat;
+                       dwords += 2;
+
+                       if (instr->flags & IR3_INSTR_SS)
+                               info->ss++;
+
+                       if (instr->flags & IR3_INSTR_SY)
+                               info->sy++;
+               }
        }
 
        return ptr;
@@ -590,84 +740,78 @@ static struct ir3_register * reg_create(struct ir3 *shader,
        return reg;
 }
 
+/* Link a newly created instruction into its block.
+ *
+ * The instruction is appended to the tail of block->instr_list.  If
+ * is_input(instr) is true it is additionally recorded in the owning
+ * shader's baryfs array.  In DEBUG builds the instruction is tagged with
+ * a serial number taken from the shader's (pre-incremented) instr_count.
+ */
-static void insert_instr(struct ir3 *shader,
+static void insert_instr(struct ir3_block *block,
                 struct ir3_instruction *instr)
 {
+       struct ir3 *shader = block->shader;
 #ifdef DEBUG
-       static uint32_t serialno = 0;
-       instr->serialno = ++serialno;
+       instr->serialno = ++shader->instr_count;
 #endif
-       if (shader->instrs_count == shader->instrs_sz) {
-               shader->instrs_sz = MAX2(2 * shader->instrs_sz, 16);
-               shader->instrs = realloc(shader->instrs,
-                               shader->instrs_sz * sizeof(shader->instrs[0]));
-       }
-       shader->instrs[shader->instrs_count++] = instr;
+       list_addtail(&instr->node, &block->instr_list);
 
-       if (is_input(instr)) {
-               if (shader->baryfs_count == shader->baryfs_sz) {
-                       shader->baryfs_sz = MAX2(2 * shader->baryfs_sz, 16);
-                       shader->baryfs = realloc(shader->baryfs,
-                                       shader->baryfs_sz * sizeof(shader->baryfs[0]));
-               }
-               shader->baryfs[shader->baryfs_count++] = instr;
-       }
+       if (is_input(instr))
+               array_insert(shader, shader->baryfs, instr);
 }
 
+/* Allocate a new block belonging to shader.
+ *
+ * The block comes from ir3_alloc(); its list node and its (empty)
+ * instruction list are initialised, and in DEBUG builds it receives a
+ * serial number from shader->block_count.  The block is not linked into
+ * any list here.
+ */
-struct ir3_block * ir3_block_create(struct ir3 *shader,
-               unsigned ntmp, unsigned nin, unsigned nout)
+struct ir3_block * ir3_block_create(struct ir3 *shader)
 {
-       struct ir3_block *block;
-       unsigned size;
-       char *ptr;
-
-       size = sizeof(*block);
-       size += sizeof(block->temporaries[0]) * ntmp;
-       size += sizeof(block->inputs[0]) * nin;
-       size += sizeof(block->outputs[0]) * nout;
-
-       ptr = ir3_alloc(shader, size);
-
-       block = (void *)ptr;
-       ptr += sizeof(*block);
-
-       block->temporaries = (void *)ptr;
-       block->ntemporaries = ntmp;
-       ptr += sizeof(block->temporaries[0]) * ntmp;
+       struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
+#ifdef DEBUG
+       block->serialno = ++shader->block_count;
+#endif
+       block->shader = shader;
+       list_inithead(&block->node);
+       list_inithead(&block->instr_list);
+       return block;
+}
 
-       block->inputs = (void *)ptr;
-       block->ninputs = nin;
-       ptr += sizeof(block->inputs[0]) * nin;
+/* Allocate an instruction with room for nreg register pointers.
+ *
+ * A single ir3_alloc() allocation holds the instruction struct followed
+ * directly by the regs pointer array.  The instruction is not inserted
+ * into block here; block is only used to reach the owning shader.
+ */
+static struct ir3_instruction *instr_create(struct ir3_block *block, int nreg)
+{
+       struct ir3_instruction *instr;
+       unsigned sz = sizeof(*instr) + (nreg * sizeof(instr->regs[0]));
+       char *ptr = ir3_alloc(block->shader, sz);
 
-       block->outputs = (void *)ptr;
-       block->noutputs = nout;
-       ptr += sizeof(block->outputs[0]) * nout;
+       /* the regs array lives immediately after the instruction struct: */
+       instr = (struct ir3_instruction *)ptr;
+       ptr  += sizeof(*instr);
+       instr->regs = (struct ir3_register **)ptr;
 
-       block->shader = shader;
+#ifdef DEBUG
+       /* remember the capacity so DEBUG builds can assert on overflow: */
+       instr->regs_max = nreg;
+#endif
 
-       return block;
+       return instr;
 }
 
+/* Create an instruction with opcode opc and capacity for nreg registers,
+ * set its owning block, and link it into that block via insert_instr().
+ */
-struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
-               int category, opc_t opc)
+struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
+               opc_t opc, int nreg)
 {
-       struct ir3_instruction *instr =
-                       ir3_alloc(block->shader, sizeof(struct ir3_instruction));
+       struct ir3_instruction *instr = instr_create(block, nreg);
        instr->block = block;
-       instr->category = category;
        instr->opc = opc;
-       insert_instr(block->shader, instr);
+       insert_instr(block, instr);
        return instr;
 }
 
+/* Convenience wrapper around ir3_instr_create2() that reserves room for
+ * a fixed 4 registers.
+ */
+struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc)
+{
+       /* NOTE: we could be slightly more clever, at least for non-meta,
+        * and choose # of regs based on category.
+        */
+       return ir3_instr_create2(block, opc, 4);
+}
+
 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
 {
-       struct ir3_instruction *new_instr =
-                       ir3_alloc(instr->block->shader, sizeof(struct ir3_instruction));
+       struct ir3_instruction *new_instr = instr_create(instr->block,
+                       instr->regs_count);
+       struct ir3_register **regs;
        unsigned i;
 
+       regs = new_instr->regs;
        *new_instr = *instr;
-       insert_instr(instr->block->shader, new_instr);
+       new_instr->regs = regs;
+
+       insert_instr(instr->block, new_instr);
 
        /* clone registers: */
        new_instr->regs_count = 0;
@@ -681,11 +825,78 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
        return new_instr;
 }
 
+/* Append dep to instr's deps array: a false dependency, used to ensure
+ * dep is scheduled first.
+ */
+void
+ir3_instr_add_dep(struct ir3_instruction *instr,
+               struct ir3_instruction *dep)
+{
+       array_insert(instr, instr->deps, dep);
+}
+
+/* Create a register (via reg_create(), allocated from the instruction's
+ * shader) and append it to instr->regs, returning the new register.
+ *
+ * The capacity of instr->regs is only checked in DEBUG builds; callers
+ * must not add more registers than the instruction was created with.
+ */
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
                 int num, int flags)
 {
-       struct ir3_register *reg = reg_create(instr->block->shader, num, flags);
-       assert(instr->regs_count < ARRAY_SIZE(instr->regs));
+       struct ir3 *shader = instr->block->shader;
+       struct ir3_register *reg = reg_create(shader, num, flags);
+#ifdef DEBUG
+       debug_assert(instr->regs_count < instr->regs_max);
+#endif
        instr->regs[instr->regs_count++] = reg;
        return reg;
 }
+
+/* Duplicate a register: allocate a fresh one from shader with
+ * reg_create() and overwrite it with a copy of *reg.
+ */
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+               struct ir3_register *reg)
+{
+       struct ir3_register *copy;
+
+       copy = reg_create(shader, 0, 0);
+       *copy = *reg;
+
+       return copy;
+}
+
+/* Set the address instruction used by instr.  When the address actually
+ * changes, instr is also appended to the shader's indirects array;
+ * setting the same address again does nothing.
+ */
+void
+ir3_instr_set_address(struct ir3_instruction *instr,
+               struct ir3_instruction *addr)
+{
+       struct ir3 *shader;
+
+       if (instr->address == addr)
+               return;
+
+       shader = instr->block->shader;
+       instr->address = addr;
+       array_insert(shader, shader->indirects, instr);
+}
+
+/* Clear the IR3_INSTR_MARK flag on every instruction in block. */
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+       list_for_each_entry (struct ir3_instruction, cur, &block->instr_list, node) {
+               cur->flags = cur->flags & ~IR3_INSTR_MARK;
+       }
+}
+
+/* Run ir3_block_clear_mark() over every block in the shader's block list. */
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+       list_for_each_entry (struct ir3_block, blk, &ir->block_list, node)
+               ir3_block_clear_mark(blk);
+}
+
+/* Number every instruction in the shader sequentially (instr->ip, starting
+ * at 0, walking the block list and each block's instruction list in order)
+ * and record each block's inclusive [start_ip, end_ip] range.  Returns the
+ * total number of instructions.
+ *
+ * An empty block gets start_ip == end_ip == the ip the next instruction
+ * will receive, rather than reading ->ip through the list head (which is
+ * not an instruction).
+ *
+ * note: this will destroy instr->depth, don't do it until after sched!
+ */
+unsigned
+ir3_count_instructions(struct ir3 *ir)
+{
+       unsigned cnt = 0;
+       list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+               unsigned first = cnt;
+
+               list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+                       instr->ip = cnt++;
+               }
+
+               block->start_ip = first;
+               /* cnt == first means no instruction was visited in this block */
+               block->end_ip = (cnt > first) ? cnt - 1 : first;
+       }
+       return cnt;
+}
+
+/* Search the shader's array list for the array with the given id.
+ * Returns the first match, or NULL when no array has that id.
+ */
+struct ir3_array *
+ir3_lookup_array(struct ir3 *ir, unsigned id)
+{
+       list_for_each_entry (struct ir3_array, candidate, &ir->array_list, node) {
+               if (candidate->id != id)
+                       continue;
+               return candidate;
+       }
+
+       return NULL;
+}