freedreno: a2xx: ir2 update
author Jonathan Marek <jonathan@marek.ca>
Tue, 24 Jul 2018 12:58:24 +0000 (08:58 -0400)
committer Rob Clark <robdclark@gmail.com>
Tue, 14 Aug 2018 16:46:25 +0000 (12:46 -0400)
this patch brings a number of changes to ir2:
-ir2 now generates CF clauses as necessary during assembly. this simplifies
 fd2_program/fd2_compiler and is necessary to implement optimization passes
-ir2 now has separate vector/scalar instructions. this will make it easier
 to implement scheduling of scalar+vector instructions together. dst_reg
 is also now separate from src registers instead of a single list
-ir2 now implements register allocation. this makes it possible to compile
 shaders which have more than 64 TGSI registers
-ir2 now implements the following optimizations: removal of IN/OUT MOV
 instructions generated by TGSI and removal of unused instructions when
 some exports are disabled
-ir2 now allows full 8-bit index for constants
-ir2_alloc no longer allocates 4 times too many bytes

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Signed-off-by: Rob Clark <robdclark@gmail.com>
src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
src/gallium/drivers/freedreno/a2xx/fd2_program.c
src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
src/gallium/drivers/freedreno/a2xx/ir-a2xx.h

index 3ad47f98508ddbb880e9c81562464cd5495df33b..12f9a1ce0ab8dbf41a68cd4e72d1526e964e2e06 100644 (file)
@@ -93,9 +93,6 @@ struct fd2_compile_context {
        unsigned position, psize;
 
        uint64_t need_sync;
-
-       /* current exec CF instruction */
-       struct ir2_cf *cf;
 };
 
 static int
@@ -130,7 +127,6 @@ compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog,
 
        ctx->prog = prog;
        ctx->so = so;
-       ctx->cf = NULL;
        ctx->pred_depth = 0;
 
        ret = tgsi_parse_init(&ctx->parser, so->tokens);
@@ -236,15 +232,6 @@ compile_free(struct fd2_compile_context *ctx)
        tgsi_parse_free(&ctx->parser);
 }
 
-static struct ir2_cf *
-next_exec_cf(struct fd2_compile_context *ctx)
-{
-       struct ir2_cf *cf = ctx->cf;
-       if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
-               ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
-       return cf;
-}
-
 static void
 compile_vtx_fetch(struct fd2_compile_context *ctx)
 {
@@ -252,13 +239,13 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
        int i;
        for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
                struct ir2_instruction *instr = ir2_instr_create(
-                               next_exec_cf(ctx), IR2_FETCH);
+                               ctx->so->ir, IR2_FETCH);
                instr->fetch.opc = VTX_FETCH;
 
                ctx->need_sync |= 1 << (i+1);
 
-               ir2_reg_create(instr, i+1, "xyzw", 0);
-               ir2_reg_create(instr, 0, "x", 0);
+               ir2_dst_create(instr, i+1, "xyzw", 0);
+               ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
 
                if (i == 0)
                        instr->sync = true;
@@ -266,7 +253,6 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
                vfetch_instrs[i] = instr;
        }
        ctx->so->num_vfetch_instrs = i;
-       ctx->cf = NULL;
 }
 
 /*
@@ -312,7 +298,7 @@ get_temp_gpr(struct fd2_compile_context *ctx, int idx)
        return num;
 }
 
-static struct ir2_register *
+static struct ir2_dst_register *
 add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
                const struct tgsi_dst_register *dst)
 {
@@ -351,10 +337,10 @@ add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
        swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
        swiz[4] = '\0';
 
-       return ir2_reg_create(alu, num, swiz, flags);
+       return ir2_dst_create(alu, num, swiz, flags);
 }
 
-static struct ir2_register *
+static struct ir2_src_register *
 add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
                const struct tgsi_src_register *src)
 {
@@ -373,6 +359,7 @@ add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
                if (ctx->type == PIPE_SHADER_VERTEX) {
                        num = src->Index + 1;
                } else {
+                       flags |= IR2_REG_INPUT;
                        num = export_linkage(ctx,
                                        ctx->input_export_idx[src->Index]);
                }
@@ -415,7 +402,7 @@ static void
 add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
 {
        if (inst->Instruction.Saturate) {
-               alu->alu.vector_clamp = true;
+               alu->alu_vector.clamp = true;
        }
 }
 
@@ -423,7 +410,7 @@ static void
 add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
 {
        if (inst->Instruction.Saturate) {
-               alu->alu.scalar_clamp = true;
+               alu->alu_scalar.clamp = true;
        }
 }
 
@@ -461,27 +448,12 @@ add_regs_vector_3(struct fd2_compile_context *ctx,
        assert(inst->Instruction.NumDstRegs == 1);
 
        add_dst_reg(ctx, alu, &inst->Dst[0].Register);
-       /* maybe should re-arrange the syntax some day, but
-        * in assembler/disassembler and what ir.c expects
-        * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
-        */
-       add_src_reg(ctx, alu, &inst->Src[2].Register);
        add_src_reg(ctx, alu, &inst->Src[0].Register);
        add_src_reg(ctx, alu, &inst->Src[1].Register);
+       add_src_reg(ctx, alu, &inst->Src[2].Register);
        add_vector_clamp(inst, alu);
 }
 
-static void
-add_regs_dummy_vector(struct ir2_instruction *alu)
-{
-       /* create dummy, non-written vector dst/src regs
-        * for unused vector instr slot:
-        */
-       ir2_reg_create(alu, 0, "____", 0); /* vector dst */
-       ir2_reg_create(alu, 0, NULL, 0);   /* vector src1 */
-       ir2_reg_create(alu, 0, NULL, 0);   /* vector src2 */
-}
-
 static void
 add_regs_scalar_1(struct fd2_compile_context *ctx,
                struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
@@ -489,8 +461,6 @@ add_regs_scalar_1(struct fd2_compile_context *ctx,
        assert(inst->Instruction.NumSrcRegs == 1);
        assert(inst->Instruction.NumDstRegs == 1);
 
-       add_regs_dummy_vector(alu);
-
        add_dst_reg(ctx, alu, &inst->Dst[0].Register);
        add_src_reg(ctx, alu, &inst->Src[0].Register);
        add_scalar_clamp(inst, alu);
@@ -567,19 +537,13 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
        struct ir2_instruction *alu;
        struct tgsi_dst_register pred_dst;
 
-       /* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
-        * themselves:
-        */
-       ctx->cf = NULL;
-
        if (ctx->pred_depth == 0) {
                /* assign predicate register: */
                ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];
 
                get_predicate(ctx, &pred_dst, NULL);
 
-               alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SETNEs);
-               add_regs_dummy_vector(alu);
+               alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SETNEs);
                add_dst_reg(ctx, alu, &pred_dst);
                add_src_reg(ctx, alu, src);
        } else {
@@ -587,7 +551,7 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
 
                get_predicate(ctx, &pred_dst, &pred_src);
 
-               alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+               alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
                add_dst_reg(ctx, alu, &pred_dst);
                add_src_reg(ctx, alu, &pred_src);
                add_src_reg(ctx, alu, src);
@@ -600,18 +564,11 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
 
        /* save previous pred state to restore in pop_predicate(): */
        ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;
-
-       ctx->cf = NULL;
 }
 
 static void
 pop_predicate(struct fd2_compile_context *ctx)
 {
-       /* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
-        * themselves:
-        */
-       ctx->cf = NULL;
-
        /* restore previous predicate state: */
        ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];
 
@@ -622,8 +579,7 @@ pop_predicate(struct fd2_compile_context *ctx)
 
                get_predicate(ctx, &pred_dst, &pred_src);
 
-               alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SET_POPs);
-               add_regs_dummy_vector(alu);
+               alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SET_POPs);
                add_dst_reg(ctx, alu, &pred_dst);
                add_src_reg(ctx, alu, &pred_src);
                alu->pred = IR2_PRED_NONE;
@@ -631,8 +587,6 @@ pop_predicate(struct fd2_compile_context *ctx)
                /* predicate register no longer needed: */
                ctx->pred_reg = -1;
        }
-
-       ctx->cf = NULL;
 }
 
 static void
@@ -693,12 +647,11 @@ translate_pow(struct fd2_compile_context *ctx,
 
        get_internal_temp(ctx, &tmp_dst, &tmp_src);
 
-       alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, LOG_CLAMP);
-       add_regs_dummy_vector(alu);
+       alu = ir2_instr_create_alu_s(ctx->so->ir, LOG_CLAMP);
        add_dst_reg(ctx, alu, &tmp_dst);
        add_src_reg(ctx, alu, &inst->Src[0].Register);
 
-       alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+       alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
        add_dst_reg(ctx, alu, &tmp_dst);
        add_src_reg(ctx, alu, &tmp_src);
        add_src_reg(ctx, alu, &inst->Src[1].Register);
@@ -725,8 +678,7 @@ translate_pow(struct fd2_compile_context *ctx,
                break;
        }
 
-       alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, EXP_IEEE);
-       add_regs_dummy_vector(alu);
+       alu = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
        add_dst_reg(ctx, alu, &inst->Dst[0].Register);
        add_src_reg(ctx, alu, &tmp_src);
        add_scalar_clamp(inst, alu);
@@ -737,7 +689,7 @@ translate_tex(struct fd2_compile_context *ctx,
                struct tgsi_full_instruction *inst, unsigned opc)
 {
        struct ir2_instruction *instr;
-       struct ir2_register *reg;
+       struct ir2_src_register *reg;
        struct tgsi_dst_register tmp_dst;
        struct tgsi_src_register tmp_src;
        const struct tgsi_src_register *coord;
@@ -766,19 +718,18 @@ translate_tex(struct fd2_compile_context *ctx,
                 *
                 *  dst = texture_sample(unit, coord, bias)
                 */
-               instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, RECIP_IEEE);
 
-               /* MAXv: */
+               instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
                add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
                add_src_reg(ctx, instr, &inst->Src[0].Register);
                add_src_reg(ctx, instr, &inst->Src[0].Register);
 
-               /* RECIP_IEEE: */
+               instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
                add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
                add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle =
                                swiz[inst->Src[0].Register.SwizzleW];
 
-               instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
                add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
                add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
                add_src_reg(ctx, instr, &inst->Src[0].Register);
@@ -788,7 +739,7 @@ translate_tex(struct fd2_compile_context *ctx,
                coord = &inst->Src[0].Register;
        }
 
-       instr = ir2_instr_create(next_exec_cf(ctx), IR2_FETCH);
+       instr = ir2_instr_create(ctx->so->ir, IR2_FETCH);
        instr->fetch.opc = TEX_FETCH;
        instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
        instr->fetch.is_rect = (inst->Texture.Texture == TGSI_TEXTURE_RECT);
@@ -807,7 +758,7 @@ translate_tex(struct fd2_compile_context *ctx,
                reg->swizzle[2] = reg->swizzle[0];
 
        /* dst register needs to be marked for sync: */
-       ctx->need_sync |= 1 << instr->regs[0]->num;
+       ctx->need_sync |= 1 << instr->dst_reg.num;
 
        /* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
        instr->sync = true;
@@ -818,7 +769,7 @@ translate_tex(struct fd2_compile_context *ctx,
                 * the texture to a temp and the use ALU instruction to move
                 * to output
                 */
-               instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
 
                add_dst_reg(ctx, instr, &inst->Dst[0].Register);
                add_src_reg(ctx, instr, &tmp_src);
@@ -869,22 +820,18 @@ translate_sge_slt_seq_sne(struct fd2_compile_context *ctx,
 
        get_internal_temp(ctx, &tmp_dst, &tmp_src);
 
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
        add_dst_reg(ctx, instr, &tmp_dst);
        add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
        add_src_reg(ctx, instr, &inst->Src[1].Register);
 
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), vopc, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, vopc);
        add_dst_reg(ctx, instr, &inst->Dst[0].Register);
-       /* maybe should re-arrange the syntax some day, but
-        * in assembler/disassembler and what ir.c expects
-        * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
-        */
-       get_immediate(ctx, &tmp_const, fui(c0));
-       add_src_reg(ctx, instr, &tmp_const);
        add_src_reg(ctx, instr, &tmp_src);
        get_immediate(ctx, &tmp_const, fui(c1));
        add_src_reg(ctx, instr, &tmp_const);
+       get_immediate(ctx, &tmp_const, fui(c0));
+       add_src_reg(ctx, instr, &tmp_const);
 }
 
 /* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
@@ -904,25 +851,25 @@ translate_lrp(struct fd2_compile_context *ctx,
        get_immediate(ctx, &tmp_const, fui(1.0));
 
        /* tmp1 = (a * b) */
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
        add_dst_reg(ctx, instr, &tmp_dst1);
        add_src_reg(ctx, instr, &inst->Src[0].Register);
        add_src_reg(ctx, instr, &inst->Src[1].Register);
 
        /* tmp2 = (1 - a) */
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
        add_dst_reg(ctx, instr, &tmp_dst2);
        add_src_reg(ctx, instr, &tmp_const);
        add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
 
        /* tmp2 = tmp2 * c */
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
        add_dst_reg(ctx, instr, &tmp_dst2);
        add_src_reg(ctx, instr, &tmp_src2);
        add_src_reg(ctx, instr, &inst->Src[2].Register);
 
        /* dst = tmp1 + tmp2 */
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
        add_dst_reg(ctx, instr, &inst->Dst[0].Register);
        add_src_reg(ctx, instr, &tmp_src1);
        add_src_reg(ctx, instr, &tmp_src2);
@@ -956,33 +903,28 @@ translate_trig(struct fd2_compile_context *ctx,
        tmp_src.SwizzleX = tmp_src.SwizzleY =
                        tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;
 
-       /* maybe should re-arrange the syntax some day, but
-        * in assembler/disassembler and what ir.c expects
-        * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
-        */
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
        add_dst_reg(ctx, instr, &tmp_dst);
-       get_immediate(ctx, &tmp_const, fui(0.5));
-       add_src_reg(ctx, instr, &tmp_const);
        add_src_reg(ctx, instr, &inst->Src[0].Register);
        get_immediate(ctx, &tmp_const, fui(0.159155));
        add_src_reg(ctx, instr, &tmp_const);
+       get_immediate(ctx, &tmp_const, fui(0.5));
+       add_src_reg(ctx, instr, &tmp_const);
 
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), FRACv, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
        add_dst_reg(ctx, instr, &tmp_dst);
        add_src_reg(ctx, instr, &tmp_src);
        add_src_reg(ctx, instr, &tmp_src);
 
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
+       instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
        add_dst_reg(ctx, instr, &tmp_dst);
-       get_immediate(ctx, &tmp_const, fui(-3.141593));
-       add_src_reg(ctx, instr, &tmp_const);
        add_src_reg(ctx, instr, &tmp_src);
        get_immediate(ctx, &tmp_const, fui(6.283185));
        add_src_reg(ctx, instr, &tmp_const);
+       get_immediate(ctx, &tmp_const, fui(-3.141593));
+       add_src_reg(ctx, instr, &tmp_const);
 
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), ~0, op);
-       add_regs_dummy_vector(instr);
+       instr = ir2_instr_create_alu_s(ctx->so->ir, op);
        add_dst_reg(ctx, instr, &inst->Dst[0].Register);
        add_src_reg(ctx, instr, &tmp_src);
 }
@@ -996,12 +938,12 @@ translate_dp2(struct fd2_compile_context *ctx,
        struct ir2_instruction *instr;
        /* DP2ADD c,a,b -> dot2(a,b) + c */
        /* for c we use the constant 0.0 */
-       instr = ir2_instr_create_alu(next_exec_cf(ctx), DOT2ADDv, ~0);
-       get_immediate(ctx, &tmp_const, fui(0.0f));
+       instr = ir2_instr_create_alu_v(ctx->so->ir, DOT2ADDv);
        add_dst_reg(ctx, instr, &inst->Dst[0].Register);
-       add_src_reg(ctx, instr, &tmp_const);
        add_src_reg(ctx, instr, &inst->Src[0].Register);
        add_src_reg(ctx, instr, &inst->Src[1].Register);
+       get_immediate(ctx, &tmp_const, fui(0.0f));
+       add_src_reg(ctx, instr, &tmp_const);
        add_vector_clamp(inst, instr);
 }
 
@@ -1015,80 +957,53 @@ translate_instruction(struct fd2_compile_context *ctx,
 {
        unsigned opc = inst->Instruction.Opcode;
        struct ir2_instruction *instr;
-       static struct ir2_cf *cf;
 
        if (opc == TGSI_OPCODE_END)
                return;
 
-       if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
-               unsigned num = inst->Dst[0].Register.Index;
-               /* seems like we need to ensure that position vs param/pixel
-                * exports don't end up in the same EXEC clause..  easy way
-                * to do this is force a new EXEC clause on first appearance
-                * of an position or param/pixel export.
-                */
-               if ((num == ctx->position) || (num == ctx->psize)) {
-                       if (ctx->num_position > 0) {
-                               ctx->cf = NULL;
-                               ir2_cf_create_alloc(ctx->so->ir, SQ_POSITION,
-                                               ctx->num_position - 1);
-                               ctx->num_position = 0;
-                       }
-               } else {
-                       if (ctx->num_param > 0) {
-                               ctx->cf = NULL;
-                               ir2_cf_create_alloc(ctx->so->ir, SQ_PARAMETER_PIXEL,
-                                               ctx->num_param - 1);
-                               ctx->num_param = 0;
-                       }
-               }
-       }
-
-       cf = next_exec_cf(ctx);
-
        /* TODO turn this into a table: */
        switch (opc) {
        case TGSI_OPCODE_MOV:
-               instr = ir2_instr_create_alu(cf, MAXv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
                add_regs_vector_1(ctx, inst, instr);
                break;
        case TGSI_OPCODE_RCP:
-               instr = ir2_instr_create_alu(cf, ~0, RECIP_IEEE);
+               instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
                add_regs_scalar_1(ctx, inst, instr);
                break;
        case TGSI_OPCODE_RSQ:
-               instr = ir2_instr_create_alu(cf, ~0, RECIPSQ_IEEE);
+               instr = ir2_instr_create_alu_s(ctx->so->ir, RECIPSQ_IEEE);
                add_regs_scalar_1(ctx, inst, instr);
                break;
        case TGSI_OPCODE_SQRT:
-               instr = ir2_instr_create_alu(cf, ~0, SQRT_IEEE);
+               instr = ir2_instr_create_alu_s(ctx->so->ir, SQRT_IEEE);
                add_regs_scalar_1(ctx, inst, instr);
                break;
        case TGSI_OPCODE_MUL:
-               instr = ir2_instr_create_alu(cf, MULv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
                add_regs_vector_2(ctx, inst, instr);
                break;
        case TGSI_OPCODE_ADD:
-               instr = ir2_instr_create_alu(cf, ADDv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
                add_regs_vector_2(ctx, inst, instr);
                break;
        case TGSI_OPCODE_DP2:
                translate_dp2(ctx, inst, opc);
                break;
        case TGSI_OPCODE_DP3:
-               instr = ir2_instr_create_alu(cf, DOT3v, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, DOT3v);
                add_regs_vector_2(ctx, inst, instr);
                break;
        case TGSI_OPCODE_DP4:
-               instr = ir2_instr_create_alu(cf, DOT4v, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, DOT4v);
                add_regs_vector_2(ctx, inst, instr);
                break;
        case TGSI_OPCODE_MIN:
-               instr = ir2_instr_create_alu(cf, MINv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, MINv);
                add_regs_vector_2(ctx, inst, instr);
                break;
        case TGSI_OPCODE_MAX:
-               instr = ir2_instr_create_alu(cf, MAXv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
                add_regs_vector_2(ctx, inst, instr);
                break;
        case TGSI_OPCODE_SLT:
@@ -1098,22 +1013,22 @@ translate_instruction(struct fd2_compile_context *ctx,
                translate_sge_slt_seq_sne(ctx, inst, opc);
                break;
        case TGSI_OPCODE_MAD:
-               instr = ir2_instr_create_alu(cf, MULADDv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
                add_regs_vector_3(ctx, inst, instr);
                break;
        case TGSI_OPCODE_LRP:
                translate_lrp(ctx, inst, opc);
                break;
        case TGSI_OPCODE_FRC:
-               instr = ir2_instr_create_alu(cf, FRACv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
                add_regs_vector_1(ctx, inst, instr);
                break;
        case TGSI_OPCODE_FLR:
-               instr = ir2_instr_create_alu(cf, FLOORv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, FLOORv);
                add_regs_vector_1(ctx, inst, instr);
                break;
        case TGSI_OPCODE_EX2:
-               instr = ir2_instr_create_alu(cf, ~0, EXP_IEEE);
+               instr = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
                add_regs_scalar_1(ctx, inst, instr);
                break;
        case TGSI_OPCODE_POW:
@@ -1128,10 +1043,9 @@ translate_instruction(struct fd2_compile_context *ctx,
                translate_tex(ctx, inst, opc);
                break;
        case TGSI_OPCODE_CMP:
-               instr = ir2_instr_create_alu(cf, CNDGTEv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, CNDGTEv);
                add_regs_vector_3(ctx, inst, instr);
-               // TODO this should be src0 if regs where in sane order..
-               instr->regs[2]->flags ^= IR2_REG_NEGATE; /* src1 */
+               instr->src_reg[0].flags ^= IR2_REG_NEGATE; /* src1 */
                break;
        case TGSI_OPCODE_IF:
                push_predicate(ctx, &inst->Src[0].Register);
@@ -1139,16 +1053,12 @@ translate_instruction(struct fd2_compile_context *ctx,
                break;
        case TGSI_OPCODE_ELSE:
                ctx->so->ir->pred = IR2_PRED_NE;
-               /* not sure if this is required in all cases, but blob compiler
-                * won't combine EQ and NE in same CF:
-                */
-               ctx->cf = NULL;
                break;
        case TGSI_OPCODE_ENDIF:
                pop_predicate(ctx);
                break;
        case TGSI_OPCODE_F2I:
-               instr = ir2_instr_create_alu(cf, TRUNCv, ~0);
+               instr = ir2_instr_create_alu_v(ctx->so->ir, TRUNCv);
                add_regs_vector_1(ctx, inst, instr);
                break;
        default:
@@ -1179,8 +1089,6 @@ compile_instructions(struct fd2_compile_context *ctx)
                        break;
                }
        }
-
-       ctx->cf->cf_type = EXEC_END;
 }
 
 int
index 834a7c7fcd79c2ea6df68816dfc2c649d52d2c87..34622eaba0ea3195ce38a0896c39312d25badf8e 100644 (file)
@@ -199,7 +199,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
                instr->fetch.offset = elem->src_offset;
 
                for (j = 0; j < 4; j++)
-                       instr->regs[0]->swizzle[j] = "xyzw01__"[desc->swizzle[j]];
+                       instr->dst_reg.swizzle[j] = "xyzw01__"[desc->swizzle[j]];
 
                assert(instr->fetch.fmt != ~0);
 
@@ -210,7 +210,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
                                instr->fetch.const_idx,
                                instr->fetch.const_idx_sel,
                                elem->instance_divisor,
-                               instr->regs[0]->swizzle,
+                               instr->dst_reg.swizzle,
                                instr->fetch.stride,
                                instr->fetch.offset);
        }
@@ -307,7 +307,6 @@ static struct fd2_shader_stateobj *
 create_blit_fp(void)
 {
        struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
-       struct ir2_cf *cf;
        struct ir2_instruction *instr;
 
        if (!so)
@@ -315,18 +314,13 @@ create_blit_fp(void)
 
        so->ir = ir2_shader_create();
 
-       cf = ir2_cf_create(so->ir, EXEC);
-
-       instr = ir2_instr_create_tex_fetch(cf, 0);
-       ir2_reg_create(instr, 0, "xyzw", 0);
-       ir2_reg_create(instr, 0, "xyx", 0);
+       instr = ir2_instr_create_tex_fetch(so->ir, 0);
+       ir2_dst_create(instr, 0, "xyzw", 0);
+       ir2_reg_create(instr, 0, "xyx", IR2_REG_INPUT);
        instr->sync = true;
 
-       cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
-       cf = ir2_cf_create(so->ir, EXEC_END);
-
-       instr = ir2_instr_create_alu(cf, MAXv, ~0);
-       ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
+       instr = ir2_instr_create_alu_v(so->ir, MAXv);
+       ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
        ir2_reg_create(instr, 0, NULL, 0);
        ir2_reg_create(instr, 0, NULL, 0);
 
@@ -349,7 +343,6 @@ static struct fd2_shader_stateobj *
 create_blit_vp(void)
 {
        struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
-       struct ir2_cf *cf;
        struct ir2_instruction *instr;
 
        if (!so)
@@ -357,31 +350,23 @@ create_blit_vp(void)
 
        so->ir = ir2_shader_create();
 
-       cf = ir2_cf_create(so->ir, EXEC);
-
-       instr = ir2_instr_create_vtx_fetch(cf, 26, 1, FMT_32_32_FLOAT, false, 8);
+       instr = ir2_instr_create_vtx_fetch(so->ir, 26, 1, FMT_32_32_FLOAT, false, 8);
        instr->fetch.is_normalized = true;
-       ir2_reg_create(instr, 1, "xy01", 0);
-       ir2_reg_create(instr, 0, "x", 0);
+       ir2_dst_create(instr, 1, "xy01", 0);
+       ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
 
-       instr = ir2_instr_create_vtx_fetch(cf, 26, 0, FMT_32_32_32_FLOAT, false, 12);
+       instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
        instr->fetch.is_normalized = true;
-       ir2_reg_create(instr, 2, "xyz1", 0);
-       ir2_reg_create(instr, 0, "x", 0);
-
-       cf = ir2_cf_create_alloc(so->ir, SQ_POSITION, 0);
-       cf = ir2_cf_create(so->ir, EXEC);
+       ir2_dst_create(instr, 2, "xyz1", 0);
+       ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
 
-       instr = ir2_instr_create_alu(cf, MAXv, ~0);
-       ir2_reg_create(instr, 62, NULL, IR2_REG_EXPORT);
+       instr = ir2_instr_create_alu_v(so->ir, MAXv);
+       ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
        ir2_reg_create(instr, 2, NULL, 0);
        ir2_reg_create(instr, 2, NULL, 0);
 
-       cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
-       cf = ir2_cf_create(so->ir, EXEC_END);
-
-       instr = ir2_instr_create_alu(cf, MAXv, ~0);
-       ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
+       instr = ir2_instr_create_alu_v(so->ir, MAXv);
+       ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
        ir2_reg_create(instr, 1, NULL, 0);
        ir2_reg_create(instr, 1, NULL, 0);
 
@@ -397,7 +382,6 @@ static struct fd2_shader_stateobj *
 create_solid_fp(void)
 {
        struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
-       struct ir2_cf *cf;
        struct ir2_instruction *instr;
 
        if (!so)
@@ -405,11 +389,8 @@ create_solid_fp(void)
 
        so->ir = ir2_shader_create();
 
-       cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
-       cf = ir2_cf_create(so->ir, EXEC_END);
-
-       instr = ir2_instr_create_alu(cf, MAXv, ~0);
-       ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
+       instr = ir2_instr_create_alu_v(so->ir, MAXv);
+       ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
        ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
        ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
 
@@ -430,7 +411,6 @@ static struct fd2_shader_stateobj *
 create_solid_vp(void)
 {
        struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
-       struct ir2_cf *cf;
        struct ir2_instruction *instr;
 
        if (!so)
@@ -438,22 +418,15 @@ create_solid_vp(void)
 
        so->ir = ir2_shader_create();
 
-       cf = ir2_cf_create(so->ir, EXEC);
-
-       instr = ir2_instr_create_vtx_fetch(cf, 26, 0, FMT_32_32_32_FLOAT, false, 12);
-       ir2_reg_create(instr, 1, "xyz1", 0);
-       ir2_reg_create(instr, 0, "x", 0);
-
-       cf = ir2_cf_create_alloc(so->ir, SQ_POSITION, 0);
-       cf = ir2_cf_create(so->ir, EXEC);
+       instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
+       ir2_dst_create(instr, 1, "xyz1", 0);
+       ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
 
-       instr = ir2_instr_create_alu(cf, MAXv, ~0);
-       ir2_reg_create(instr, 62, NULL, IR2_REG_EXPORT);
+       instr = ir2_instr_create_alu_v(so->ir, MAXv);
+       ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
        ir2_reg_create(instr, 1, NULL, 0);
        ir2_reg_create(instr, 1, NULL, 0);
 
-       cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
-       cf = ir2_cf_create(so->ir, EXEC_END);
 
        return assemble(so);
 }
index ac972ed35a1a3dfea8b867f416dc79005e696ee7..5a9f93ec794d798e7fcf05b6fcec7b7c117f7f0a 100644 (file)
@@ -147,15 +147,25 @@ typedef struct PACKED {
        uint8_t             const_1_rel_abs          : 1;
        uint8_t             const_0_rel_abs          : 1;
        /* dword2: */
-       uint8_t             src3_reg                 : 6;
-       uint8_t             src3_reg_select          : 1;
-       uint8_t             src3_reg_abs             : 1;
-       uint8_t             src2_reg                 : 6;
-       uint8_t             src2_reg_select          : 1;
-       uint8_t             src2_reg_abs             : 1;
-       uint8_t             src1_reg                 : 6;
-       uint8_t             src1_reg_select          : 1;
-       uint8_t             src1_reg_abs             : 1;
+       union {
+               struct {
+                       uint8_t             src3_reg         : 6;
+                       uint8_t             src3_reg_select  : 1;
+                       uint8_t             src3_reg_abs     : 1;
+                       uint8_t             src2_reg         : 6;
+                       uint8_t             src2_reg_select  : 1;
+                       uint8_t             src2_reg_abs     : 1;
+                       uint8_t             src1_reg         : 6;
+                       uint8_t             src1_reg_select  : 1;
+                       uint8_t             src1_reg_abs     : 1;
+               };
+               /* constants have full 8-bit index */
+               struct {
+                       uint8_t             src3_reg_const   : 8;
+                       uint8_t             src2_reg_const   : 8;
+                       uint8_t             src1_reg_const   : 8;
+               };
+       };
        instr_vector_opc_t  vector_opc               : 5;
        uint8_t             src3_sel                 : 1;
        uint8_t             src2_sel                 : 1;
index 42a9ab494e6ca649d231ef29f71393f8838b94b3..af9811864ffa2ded07e39087992278c5cc6ad6c0 100644 (file)
 #define WARN_MSG(f, ...)   DBG("WARN:  "f, ##__VA_ARGS__)
 #define ERROR_MSG(f, ...)  DBG("ERROR: "f, ##__VA_ARGS__)
 
-#define REG_MASK 0x3f
-
-static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr);
-
 static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords,
                uint32_t idx, struct ir2_shader_info *info);
 
-static void reg_update_stats(struct ir2_register *reg,
-               struct ir2_shader_info *info, bool dest);
-static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n);
-static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg);
-static uint32_t reg_alu_dst_swiz(struct ir2_register *reg);
-static uint32_t reg_alu_src_swiz(struct ir2_register *reg);
+static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n);
+static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg);
+static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg);
+static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg);
 
 /* simple allocator to carve allocations out of an up-front allocated heap,
  * so that we can free everything easily in one shot.
@@ -55,7 +49,7 @@ static uint32_t reg_alu_src_swiz(struct ir2_register *reg);
 static void * ir2_alloc(struct ir2_shader *shader, int sz)
 {
        void *ptr = &shader->heap[shader->heap_idx];
-       shader->heap_idx += align(sz, 4);
+       shader->heap_idx += align(sz, 4) / 4; /* assumes 4-byte heap[] elements */
        return ptr;
 }
 
@@ -74,7 +68,9 @@ static char * ir2_strdup(struct ir2_shader *shader, const char *str)
 struct ir2_shader * ir2_shader_create(void)
 {
        DEBUG_MSG("");
-       return calloc(1, sizeof(struct ir2_shader));
+       struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader));
+       shader->max_reg = -1;
+       return shader;
 }
 
 void ir2_shader_destroy(struct ir2_shader *shader)
@@ -83,189 +79,344 @@ void ir2_shader_destroy(struct ir2_shader *shader)
        free(shader);
 }
 
-/* resolve addr/cnt/sequence fields in the individual CF's */
-static int shader_resolve(struct ir2_shader *shader, struct ir2_shader_info *info)
+/* Check if instr is a plain MOV (MAXv of a register with itself, unmodified).
+ * Returns NULL if not, else instr (or, if output, the writer of its source). */
+static struct ir2_instruction * simple_mov(struct ir2_instruction *instr,
+               bool output)
 {
-       uint32_t addr;
-       unsigned i;
-       int j;
-
-       addr = shader->cfs_count / 2;
-       for (i = 0; i < shader->cfs_count; i++) {
-               struct ir2_cf *cf = shader->cfs[i];
-               if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) {
-                       uint32_t sequence = 0;
-
-                       if (cf->exec.addr && (cf->exec.addr != addr))
-                               WARN_MSG("invalid addr '%d' at CF %d", cf->exec.addr, i);
-                       if (cf->exec.cnt && (cf->exec.cnt != cf->exec.instrs_count))
-                               WARN_MSG("invalid cnt '%d' at CF %d", cf->exec.cnt, i);
-
-                       for (j = cf->exec.instrs_count - 1; j >= 0; j--) {
-                               struct ir2_instruction *instr = cf->exec.instrs[j];
-                               sequence <<= 2;
-                               if (instr->instr_type == IR2_FETCH)
-                                       sequence |= 0x1;
-                               if (instr->sync)
-                                       sequence |= 0x2;
-                       }
+    struct ir2_src_register *src_reg = instr->src_reg;
+    struct ir2_dst_register *dst_reg = &instr->dst_reg;
+    struct ir2_register *reg;
+    unsigned i;
+
+    /* a MOV is expressed as a vector MAXv of a source with itself */
+    if (instr->instr_type != IR2_ALU_VECTOR ||
+               instr->alu_vector.opc != MAXv)
+               return NULL;
+
+       /* both operands must name the same register number */
+       if (src_reg[0].num != src_reg[1].num)
+               return NULL;
+
+       /* reject negate/abs; for output also reject input/const sources */
+       int flags = IR2_REG_NEGATE | IR2_REG_ABS;
+       if (output)
+               flags |= IR2_REG_INPUT | IR2_REG_CONST;
+       if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags))
+               return NULL;
+
+       /* a clamped result is not a plain copy */
+       if (instr->alu_vector.clamp)
+               return NULL;
+
+       /* swizzle: each dst component other than '_' must match both srcs */
+    for (i = 0; i < 4; i++) {
+               char swiz = (dst_reg->swizzle ? dst_reg->swizzle : "xyzw")[i];
+               if (swiz == '_')
+                       continue;
+
+               if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] ||
+                       swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : "xyzw")[i])
+                       return NULL;
+    }
+
+    if (output)
+               reg = &instr->shader->reg[src_reg[0].num];
+       else
+               reg = &instr->shader->reg[dst_reg->num];
+
+       assert(reg->write_idx >= 0);
+    if (reg->write_idx != reg->write_idx2) /* first write != last write */
+               return NULL;
+
+       if (!output)
+               return instr;
+
+       instr = instr->shader->instr[reg->write_idx]; /* writer of the source */
+       return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr;
+}
 
-                       cf->exec.addr = addr;
-                       cf->exec.cnt  = cf->exec.instrs_count;
-                       cf->exec.sequence = sequence;
+static int src_to_reg(struct ir2_instruction *instr,
+               struct ir2_src_register *reg)
+{      /* index to encode for a source operand */
+       if (reg->flags & IR2_REG_CONST) /* constants are used by number as-is */
+               return reg->num;
 
-                       addr += cf->exec.instrs_count;
-               }
-       }
+       return instr->shader->reg[reg->num].reg; /* else the register RA assigned */
+}
+
+static int dst_to_reg(struct ir2_instruction *instr,
+               struct ir2_dst_register *reg)
+{      /* index to encode for a destination operand */
+       if (reg->flags & IR2_REG_EXPORT) /* exports are used by number as-is */
+               return reg->num;
 
-       info->sizedwords = 3 * addr;
+       return instr->shader->reg[reg->num].reg; /* else the register RA assigned */
+}
 
-       return 0;
+static bool mask_get(uint32_t *mask, unsigned index)
+{   /* test bit `index` of a bitmask stored as an array of 32-bit words */
+    return !!(mask[index / 32] & (1u << (index % 32))); /* 1u: no UB at bit 31 */
 }
 
-void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info *info)
+static void mask_set(uint32_t *mask, struct ir2_register *reg, int index)
 {
-       uint32_t i, j;
-       uint32_t *ptr, *dwords = NULL;
-       uint32_t idx = 0;
-       int ret;
-
-       info->sizedwords    = 0;
-       info->max_reg       = -1;
-       info->max_input_reg = 0;
-       info->regs_written  = 0;
-
-       /* we need an even # of CF's.. insert a NOP if needed */
-       if (shader->cfs_count != align(shader->cfs_count, 2))
-               ir2_cf_create(shader, NOP);
-
-       /* first pass, resolve sizes and addresses: */
-       ret = shader_resolve(shader, info);
-       if (ret) {
-               ERROR_MSG("resolve failed: %d", ret);
-               goto fail;
+       if (reg) { /* OR in reg's own dependency mask, word by word */
+               unsigned i;
+               for (i = 0; i < ARRAY_SIZE(reg->regmask); i++)
+                       mask[i] |= reg->regmask[i];
        }
+       if (index >= 0) /* a negative index sets no extra bit */
+               mask[index / 32] |= 1u << (index % 32); /* 1u: bit 31 without UB */
+}
 
-       ptr = dwords = calloc(4, info->sizedwords);
+static bool sets_pred(struct ir2_instruction *instr)
+{   /* true if instr is a scalar ALU op in the PRED_SETEs..PRED_SET_RESTOREs range */
+    return instr->instr_type == IR2_ALU_SCALAR &&
+               instr->alu_scalar.opc >= PRED_SETEs &&
+               instr->alu_scalar.opc <= PRED_SET_RESTOREs; /* presumably contiguous PRED_* opcodes; verify enum */
+}
 
-       /* second pass, emit CF program in pairs: */
-       for (i = 0; i < shader->cfs_count; i += 2) {
-               instr_cf_t *cfs = (instr_cf_t *)ptr;
-               ret = cf_emit(shader->cfs[i], &cfs[0]);
-               if (ret) {
-                       ERROR_MSG("CF emit failed: %d\n", ret);
-                       goto fail;
+/* Assemble the shader IR into a malloc()ed dword buffer (CF instructions first,
+ * then ALU/fetch); sets info->sizedwords/max_reg. Returns NULL if malloc fails. */
+void* ir2_shader_assemble(struct ir2_shader *shader,
+               struct ir2_shader_info *info)
+{
+       /* NOTES
+        * blob compiler seems to always put PRED_* instrs in a CF by
+        * themselves, and won't combine EQ/NE in the same CF
+        * (not doing this - doesn't seem to make a difference)
+        *
+        * TODO: implement scheduling for combining vector+scalar instructions
+        * -some vector instructions can be replaced by scalar
+        */
+
+       /* first step:
+        * 1. remove "NOP" MOV instructions generated by TGSI for input/output:
+        * 2. track information for register allocation, and to remove
+        * the dead code when some exports are not needed
+        * 3. add additional instructions for a20x hw binning if needed
+        * NOTE: modifies the shader instrs
+        * this step could be done as instructions are added by compiler instead
+        */
+
+       /* mask of exports that must be generated
+        * used to avoid calculating ps exports with hw binning
+       */
+       uint64_t export = ~0ull;
+       /* bitmask of variables required for exports defined by "export" */
+       uint32_t export_mask[REG_MASK/32+1] = {0};
+
+       unsigned idx, reg_idx;
+       unsigned max_input = 0;
+       int export_size = -1;
+
+       for (idx = 0; idx < shader->instr_count; idx++) {
+               struct ir2_instruction *instr = shader->instr[idx], *prev;
+               struct ir2_dst_register dst_reg = instr->dst_reg;
+
+               if (dst_reg.flags & IR2_REG_EXPORT) {
+                       if (dst_reg.num < 32)
+                               export_size++;
+
+                       if ((prev = simple_mov(instr, true))) {
+                               /* copy instruction but keep dst */
+                               *instr = *prev;
+                               instr->dst_reg = dst_reg;
+                       }
                }
-               ret = cf_emit(shader->cfs[i+1], &cfs[1]);
-               if (ret) {
-                       ERROR_MSG("CF emit failed: %d\n", ret);
-                       goto fail;
+
+               for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) {
+                       struct ir2_src_register *src_reg = &instr->src_reg[reg_idx];
+                       struct ir2_register *reg;
+                       int num;
+
+                       if (src_reg->flags & IR2_REG_CONST)
+                               continue;
+
+                       num = src_reg->num;
+                       reg = &shader->reg[num];
+                       reg->read_idx = idx;
+
+                       if (src_reg->flags & IR2_REG_INPUT) {
+                               max_input = MAX2(max_input, num);
+                       } else {
+                               /* bypass simple mov used to set src_reg */
+                               assert(reg->write_idx >= 0);
+                               prev = shader->instr[reg->write_idx];
+                               if (simple_mov(prev, false)) {
+                                       *src_reg = prev->src_reg[0];
+                                       /* process same src_reg again */
+                                       reg_idx -= 1;
+                                       continue;
+                               }
+                       }
+
+                       /* update dependencies */
+                       uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ?
+                                       export_mask : shader->reg[dst_reg.num].regmask;
+                       mask_set(mask, reg, num);
+                       if (sets_pred(instr))
+                               mask_set(export_mask, reg, num);
                }
-               ptr += 3;
-               assert((ptr - dwords) <= info->sizedwords);
        }
 
-       /* third pass, emit ALU/FETCH: */
-       for (i = 0; i < shader->cfs_count; i++) {
-               struct ir2_cf *cf = shader->cfs[i];
-               if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) {
-                       for (j = 0; j < cf->exec.instrs_count; j++) {
-                               ret = instr_emit(cf->exec.instrs[j], ptr, idx++, info);
-                               if (ret) {
-                                       ERROR_MSG("instruction emit failed: %d", ret);
-                                       goto fail;
-                               }
-                               ptr += 3;
-                               assert((ptr - dwords) <= info->sizedwords);
+       /* second step:
+        * emit instructions (with CFs) + RA
+        */
+       instr_cf_t cfs[128], *cf = cfs; /* NOTE(review): fixed capacity, not bounds-checked */
+       uint32_t alufetch[3*256], *af = alufetch;
+
+       /* RA is done on write, so inputs must be allocated here */
+       for (reg_idx = 0; reg_idx <= max_input; reg_idx++)
+               shader->reg[reg_idx].reg = reg_idx;
+       info->max_reg = max_input;
+
+       /* CF instr state */
+       instr_cf_exec_t exec = { .opc = EXEC };
+       instr_cf_alloc_t alloc = { .opc = ALLOC };
+       bool need_alloc = 0;
+       bool pos_export = 0;
+
+       export_size = MAX2(export_size, 0);
+
+       for (idx = 0; idx < shader->instr_count; idx++) {
+               struct ir2_instruction *instr = shader->instr[idx];
+               struct ir2_dst_register *dst_reg = &instr->dst_reg;
+               unsigned num = dst_reg->num;
+               struct ir2_register *reg;
+
+               /* a2xx only has 64 registers, so we can use a single 64-bit mask */
+               uint64_t regmask = 0ull;
+
+               /* compute the current regmask */
+               for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) {
+                       reg = &shader->reg[reg_idx];
+                       if ((int) idx > reg->write_idx && idx < reg->read_idx)
+                               regmask |= (1ull << reg->reg);
+               }
+
+               if (dst_reg->flags & IR2_REG_EXPORT) {
+                       /* skip if export is not needed */
+                       if (!(export & (1ull << num)))
+                               continue;
+
+            /* ALLOC CF:
+             * want to alloc all < 32 at once
+                        * 32/33 and 62/63 come in pairs
+                        * XXX assuming all 3 types are never interleaved
+                        */
+            if (num < 32) {
+                               alloc.size = export_size;
+                               alloc.buffer_select = SQ_PARAMETER_PIXEL;
+                               need_alloc = export_size >= 0;
+                               export_size = -1;
+                       } else if (num == 32 || num == 33) {
+                               alloc.size = 0;
+                               alloc.buffer_select = SQ_MEMORY;
+                               need_alloc = num != 33;
+                       } else {
+                               alloc.size = 0;
+                               alloc.buffer_select = SQ_POSITION;
+                               need_alloc = !pos_export;
+                               pos_export = true;
                        }
+
+               } else {
+                       /* skip if dst register not needed to compute exports */
+                       if (!mask_get(export_mask, num))
+                               continue;
+
+                       /* RA on first write */
+                       reg = &shader->reg[num];
+                       if (reg->write_idx == idx) {
+                               reg->reg = ffsll(~regmask) - 1;
+                               info->max_reg = MAX2(info->max_reg, reg->reg);
+                       }
+               }
+
+               if (exec.count == 6 || (exec.count && need_alloc)) {
+                       *cf++ = *(instr_cf_t*) &exec;
+                       exec.address += exec.count;
+                       exec.serialize = 0;
+                       exec.count = 0;
                }
+
+               if (need_alloc) {
+                       *cf++ = *(instr_cf_t*) &alloc;
+                       need_alloc = false;
+               }
+
+               int ret = instr_emit(instr, af, idx, info); af += 3;
+               assert(!ret);
+
+               if (instr->instr_type == IR2_FETCH)
+                       exec.serialize |= 0x1 << exec.count * 2;
+               if (instr->sync)
+                       exec.serialize |= 0x2 << exec.count * 2;
+                exec.count += 1;
        }
 
-       return dwords;
 
-fail:
-       free(dwords);
-       return NULL;
-}
+       exec.opc = !export_size ? EXEC : EXEC_END;
+       *cf++ = *(instr_cf_t*) &exec;
+       exec.address += exec.count;
+       exec.serialize = 0;
+       exec.count = 0;
 
+       /* GPU will hang without at least one pixel alloc */
+       if (!export_size) {
+               alloc.size = 0;
+               alloc.buffer_select = SQ_PARAMETER_PIXEL;
+               *cf++ = *(instr_cf_t*) &alloc;
 
-struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type)
-{
-       struct ir2_cf *cf = ir2_alloc(shader, sizeof(struct ir2_cf));
-       DEBUG_MSG("%d", cf_type);
-       cf->shader = shader;
-       cf->cf_type = cf_type;
-       assert(shader->cfs_count < ARRAY_SIZE(shader->cfs));
-       shader->cfs[shader->cfs_count++] = cf;
-       return cf;
-}
+               exec.opc = EXEC_END;
+               *cf++ = *(instr_cf_t*) &exec;
+       }
 
+       unsigned num_cfs = cf - cfs;
 
-/*
- * CF instructions:
- */
+       /* insert nop to get an even # of CFs */
+       if (num_cfs % 2) {
+               *cf++ = (instr_cf_t) { .opc = NOP };
+               num_cfs++;
+       }
 
-static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr)
-{
-       memset(instr, 0, sizeof(*instr));
-
-       instr->opc = cf->cf_type;
-
-       switch (cf->cf_type) {
-       case NOP:
-               break;
-       case EXEC:
-       case EXEC_END:
-               assert(cf->exec.addr <= 0x1ff);
-               assert(cf->exec.cnt <= 0x6);
-               assert(cf->exec.sequence <= 0xfff);
-               instr->exec.address = cf->exec.addr;
-               instr->exec.count = cf->exec.cnt;
-               instr->exec.serialize = cf->exec.sequence;
-               break;
-       case ALLOC:
-               assert(cf->alloc.size <= 0xf);
-               instr->alloc.size = cf->alloc.size;
-               switch (cf->alloc.type) {
-               case SQ_POSITION:
-               case SQ_PARAMETER_PIXEL:
-                       instr->alloc.buffer_select = cf->alloc.type;
+       /* offset cf addrs */
+       for (idx = 0; idx < num_cfs; idx++) {
+        switch (cfs[idx].opc) {
+               case EXEC:
+               case EXEC_END:
+                       cfs[idx].exec.address += num_cfs / 2;
                        break;
                default:
-                       ERROR_MSG("invalid alloc type: %d", cf->alloc.type);
-                       return -1;
+                       break;
+               /* XXX  and any other address using cf that gets implemented */
                }
-               break;
-       case COND_EXEC:
-       case COND_EXEC_END:
-       case COND_PRED_EXEC:
-       case COND_PRED_EXEC_END:
-       case LOOP_START:
-       case LOOP_END:
-       case COND_CALL:
-       case RETURN:
-       case COND_JMP:
-       case COND_EXEC_PRED_CLEAN:
-       case COND_EXEC_PRED_CLEAN_END:
-       case MARK_VS_FETCH_DONE:
-               ERROR_MSG("TODO");
-               return -1;
        }
 
-       return 0;
+       /* concatenate cfs+alufetchs */
+       uint32_t cfdwords = num_cfs / 2 * 3;
+       uint32_t alufetchdwords = exec.address * 3;
+       info->sizedwords = cfdwords + alufetchdwords;
+       uint32_t *dwords = malloc(info->sizedwords * 4);
+       if (!dwords) return NULL; /* report OOM to the caller instead of asserting */
+       memcpy(dwords, cfs, cfdwords * 4);
+       memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4);
+       return dwords;
 }
 
-
-struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type)
+struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
+               int instr_type)
 {
        struct ir2_instruction *instr =
-                       ir2_alloc(cf->shader, sizeof(struct ir2_instruction));
+                       ir2_alloc(shader, sizeof(struct ir2_instruction));
        DEBUG_MSG("%d", instr_type);
-       instr->shader = cf->shader;
-       instr->pred = cf->shader->pred;
+       instr->shader = shader;
+       instr->idx = shader->instr_count; /* position in shader->instr[] */
+       instr->pred = shader->pred; /* take the shader's current predicate */
        instr->instr_type = instr_type;
-       assert(cf->exec.instrs_count < ARRAY_SIZE(cf->exec.instrs));
-       cf->exec.instrs[cf->exec.instrs_count++] = instr;
+       shader->instr[shader->instr_count++] = instr; /* NOTE(review): removed code asserted capacity; no check here */
        return instr;
 }
 
@@ -279,15 +430,11 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
                struct ir2_shader_info *info)
 {
        instr_fetch_t *fetch = (instr_fetch_t *)dwords;
-       int reg = 0;
-       struct ir2_register *dst_reg = instr->regs[reg++];
-       struct ir2_register *src_reg = instr->regs[reg++];
+       struct ir2_dst_register *dst_reg = &instr->dst_reg;
+       struct ir2_src_register *src_reg = &instr->src_reg[0];
 
        memset(fetch, 0, sizeof(*fetch));
 
-       reg_update_stats(dst_reg, info, true);
-       reg_update_stats(src_reg, info, false);
-
        fetch->opc = instr->fetch.opc;
 
        if (instr->fetch.opc == VTX_FETCH) {
@@ -298,9 +445,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
                assert(instr->fetch.const_idx <= 0x1f);
                assert(instr->fetch.const_idx_sel <= 0x3);
 
-               vtx->src_reg = src_reg->num;
+               vtx->src_reg = src_to_reg(instr, src_reg);
                vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1);
-               vtx->dst_reg = dst_reg->num;
+               vtx->dst_reg = dst_to_reg(instr, dst_reg);
                vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg);
                vtx->must_be_one = 1;
                vtx->const_index = instr->fetch.const_idx;
@@ -326,9 +473,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
 
                assert(instr->fetch.const_idx <= 0x1f);
 
-               tex->src_reg = src_reg->num;
+               tex->src_reg = src_to_reg(instr, src_reg);
                tex->src_swiz = reg_fetch_src_swiz(src_reg, 3);
-               tex->dst_reg = dst_reg->num;
+               tex->dst_reg = dst_to_reg(instr, dst_reg);
                tex->dst_swiz = reg_fetch_dst_swiz(dst_reg);
                tex->const_idx = instr->fetch.const_idx;
                tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
@@ -360,95 +507,62 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
  * ALU instructions:
  */
 
-static int instr_emit_alu(struct ir2_instruction *instr, uint32_t *dwords,
+static int instr_emit_alu(struct ir2_instruction *instr_v,
+               struct ir2_instruction *instr_s, uint32_t *dwords,
                struct ir2_shader_info *info)
 {
-       int reg = 0;
        instr_alu_t *alu = (instr_alu_t *)dwords;
-       struct ir2_register *dst_reg  = instr->regs[reg++];
-       struct ir2_register *src1_reg;
-       struct ir2_register *src2_reg;
-       struct ir2_register *src3_reg;
+       struct ir2_dst_register *vdst_reg, *sdst_reg;
+       struct ir2_src_register *src1_reg, *src2_reg, *src3_reg;
+       struct ir2_shader *shader = instr_v ? instr_v->shader : instr_s->shader;
+       enum ir2_pred pred = IR2_PRED_NONE;
 
        memset(alu, 0, sizeof(*alu));
 
-       /* handle instructions w/ 3 src operands: */
-       switch (instr->alu.vector_opc) {
-       case MULADDv:
-       case CNDEv:
-       case CNDGTEv:
-       case CNDGTv:
-       case DOT2ADDv:
-               /* note: disassembler lists 3rd src first, ie:
-                *   MULADDv Rdst = Rsrc3 + (Rsrc1 * Rsrc2)
-                * which is the reason for this strange ordering.
-                */
-               src3_reg = instr->regs[reg++];
-               break;
-       default:
-               src3_reg = NULL;
-               break;
+       vdst_reg = NULL;
+       sdst_reg = NULL;
+       src1_reg = NULL;
+       src2_reg = NULL;
+       src3_reg = NULL;
+
+       if (instr_v) {
+               vdst_reg = &instr_v->dst_reg;
+               assert(instr_v->src_reg_count >= 2);
+               src1_reg = &instr_v->src_reg[0];
+               src2_reg = &instr_v->src_reg[1];
+               if (instr_v->src_reg_count > 2)
+                       src3_reg = &instr_v->src_reg[2];
+               pred = instr_v->pred;
        }
 
-       src1_reg = instr->regs[reg++];
-       src2_reg = instr->regs[reg++];
-
-       reg_update_stats(dst_reg, info, true);
-       reg_update_stats(src1_reg, info, false);
-       reg_update_stats(src2_reg, info, false);
-
-       assert((dst_reg->flags & ~IR2_REG_EXPORT) == 0);
-       assert(!dst_reg->swizzle || (strlen(dst_reg->swizzle) == 4));
-       assert((src1_reg->flags & IR2_REG_EXPORT) == 0);
-       assert(!src1_reg->swizzle || (strlen(src1_reg->swizzle) == 4));
-       assert((src2_reg->flags & IR2_REG_EXPORT) == 0);
-       assert(!src2_reg->swizzle || (strlen(src2_reg->swizzle) == 4));
+       if (instr_s) {
+               sdst_reg = &instr_s->dst_reg;
+               assert(instr_s->src_reg_count == 1);
+               assert(!instr_v || vdst_reg->flags == sdst_reg->flags);
+               assert(!instr_v || pred == instr_s->pred);
+               if (src3_reg) {
+                       assert(src3_reg->flags == instr_s->src_reg[0].flags);
+                       assert(src3_reg->num == instr_s->src_reg[0].num);
+                       assert(!strcmp(src3_reg->swizzle, instr_s->src_reg[0].swizzle));
+               }
+               src3_reg = &instr_s->src_reg[0];
+               pred = instr_s->pred;
+       }
 
-       if (instr->alu.vector_opc == (instr_vector_opc_t)~0) {
-               alu->vector_opc          = MAXv;
-               alu->vector_write_mask   = 0;
+       if (vdst_reg) {
+               assert((vdst_reg->flags & ~IR2_REG_EXPORT) == 0);
+               assert(!vdst_reg->swizzle || (strlen(vdst_reg->swizzle) == 4));
+               alu->vector_opc          = instr_v->alu_vector.opc;
+               alu->vector_write_mask   = reg_alu_dst_swiz(vdst_reg);
+               alu->vector_dest         = dst_to_reg(instr_v, vdst_reg);
        } else {
-               alu->vector_opc          = instr->alu.vector_opc;
-               alu->vector_write_mask   = reg_alu_dst_swiz(dst_reg);
+               alu->vector_opc          = MAXv;
        }
 
-       alu->vector_dest         = dst_reg->num;
-       alu->export_data         = !!(dst_reg->flags & IR2_REG_EXPORT);
-
-       // TODO predicate case/condition.. need to add to parser
-
-       alu->src2_reg            = src2_reg->num;
-       alu->src2_swiz           = reg_alu_src_swiz(src2_reg);
-       alu->src2_reg_negate     = !!(src2_reg->flags & IR2_REG_NEGATE);
-       alu->src2_reg_abs        = !!(src2_reg->flags & IR2_REG_ABS);
-       alu->src2_sel            = !(src2_reg->flags & IR2_REG_CONST);
-
-       alu->src1_reg            = src1_reg->num;
-       alu->src1_swiz           = reg_alu_src_swiz(src1_reg);
-       alu->src1_reg_negate     = !!(src1_reg->flags & IR2_REG_NEGATE);
-       alu->src1_reg_abs        = !!(src1_reg->flags & IR2_REG_ABS);
-       alu->src1_sel            = !(src1_reg->flags & IR2_REG_CONST);
-
-       alu->vector_clamp        = instr->alu.vector_clamp;
-       alu->scalar_clamp        = instr->alu.scalar_clamp;
-
-       if (instr->alu.scalar_opc != (instr_scalar_opc_t)~0) {
-               struct ir2_register *sdst_reg = instr->regs[reg++];
-
-               reg_update_stats(sdst_reg, info, true);
-
-               assert(sdst_reg->flags == dst_reg->flags);
-
-               if (src3_reg) {
-                       assert(src3_reg == instr->regs[reg]);
-                       reg++;
-               } else {
-                       src3_reg = instr->regs[reg++];
-               }
-
-               alu->scalar_dest         = sdst_reg->num;
+       if (sdst_reg) {
+               alu->scalar_opc          = instr_s->alu_scalar.opc;
                alu->scalar_write_mask   = reg_alu_dst_swiz(sdst_reg);
-               alu->scalar_opc          = instr->alu.scalar_opc;
+               alu->scalar_dest         = dst_to_reg(instr_s, sdst_reg);
        } else {
                /* not sure if this is required, but adreno compiler seems
                 * to always set scalar opc to MAXs if it is not used:
@@ -456,13 +570,58 @@ static int instr_emit_alu(struct ir2_instruction *instr, uint32_t *dwords,
                alu->scalar_opc = MAXs;
        }
 
-       if (src3_reg) {
-               reg_update_stats(src3_reg, info, false);
+       alu->export_data =
+               !!((instr_v ? vdst_reg : sdst_reg)->flags & IR2_REG_EXPORT);
 
-               alu->src3_reg            = src3_reg->num;
+       /* export32 has this bit set.. it seems to do more than just set
+        * the base address of the constants used to zero
+        * TODO make this less of a hack
+        */
+       if (alu->export_data && alu->vector_dest == 32) {
+               assert(!instr_s);
+               alu->relative_addr = 1;
+       }
+
+       if (src1_reg) {
+               if (src1_reg->flags & IR2_REG_CONST) {
+                       assert(!(src1_reg->flags & IR2_REG_ABS));
+                       alu->src1_reg_const  = src1_reg->num;
+               } else {
+                       alu->src1_reg        = shader->reg[src1_reg->num].reg;
+                       alu->src1_reg_abs    = !!(src1_reg->flags & IR2_REG_ABS);
+               }
+               alu->src1_swiz           = reg_alu_src_swiz(src1_reg);
+               alu->src1_reg_negate     = !!(src1_reg->flags & IR2_REG_NEGATE);
+               alu->src1_sel            = !(src1_reg->flags & IR2_REG_CONST);
+    }  else {
+               alu->src1_sel = 1;
+       }
+
+    if (src2_reg) {
+               if (src2_reg->flags & IR2_REG_CONST) {
+                       assert(!(src2_reg->flags & IR2_REG_ABS));
+                       alu->src2_reg_const  = src2_reg->num;
+               } else {
+                       alu->src2_reg        = shader->reg[src2_reg->num].reg;
+                       alu->src2_reg_abs    = !!(src2_reg->flags & IR2_REG_ABS);
+               }
+               alu->src2_swiz           = reg_alu_src_swiz(src2_reg);
+               alu->src2_reg_negate     = !!(src2_reg->flags & IR2_REG_NEGATE);
+               alu->src2_sel            = !(src2_reg->flags & IR2_REG_CONST);
+    } else {
+               alu->src2_sel = 1;
+    }
+
+    if (src3_reg) {
+               if (src3_reg->flags & IR2_REG_CONST) {
+                       assert(!(src3_reg->flags & IR2_REG_ABS));
+                       alu->src3_reg_const  = src3_reg->num;
+               } else {
+                       alu->src3_reg        = shader->reg[src3_reg->num].reg;
+                       alu->src3_reg_abs    = !!(src3_reg->flags & IR2_REG_ABS);
+               }
                alu->src3_swiz           = reg_alu_src_swiz(src3_reg);
                alu->src3_reg_negate     = !!(src3_reg->flags & IR2_REG_NEGATE);
-               alu->src3_reg_abs        = !!(src3_reg->flags & IR2_REG_ABS);
                alu->src3_sel            = !(src3_reg->flags & IR2_REG_CONST);
        } else {
                /* not sure if this is required, but adreno compiler seems
@@ -471,9 +630,11 @@ static int instr_emit_alu(struct ir2_instruction *instr, uint32_t *dwords,
                alu->src3_sel = 1;
        }
 
-       if (instr->pred != IR2_PRED_NONE) {
-               alu->pred_select = (instr->pred == IR2_PRED_EQ) ? 3 : 2;
-       }
+       alu->vector_clamp = instr_v ? instr_v->alu_vector.clamp : 0;
+       alu->scalar_clamp = instr_s ? instr_s->alu_scalar.clamp : 0;
+
+       if (pred != IR2_PRED_NONE)
+               alu->pred_select = (pred == IR2_PRED_EQ) ? 3 : 2;
 
        return 0;
 }
@@ -483,51 +644,63 @@ static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords,
 {
        switch (instr->instr_type) {
        case IR2_FETCH: return instr_emit_fetch(instr, dwords, idx, info);
-       case IR2_ALU:   return instr_emit_alu(instr, dwords, info);
+       case IR2_ALU_VECTOR: return instr_emit_alu(instr, NULL, dwords, info);
+       case IR2_ALU_SCALAR: return instr_emit_alu(NULL, instr, dwords, info);
        }
        return -1;
 }
 
-
-struct ir2_register * ir2_reg_create(struct ir2_instruction *instr,
+/*
+ * Set up the destination register of 'instr' and return a pointer to it
+ * (the destination is stored inside the instruction, in instr->dst_reg).
+ *
+ * For destinations without IR2_REG_EXPORT this also updates the
+ * bookkeeping in shader->reg[num]: write_idx records the first instruction
+ * writing the register, write_idx2 the most recent one.  shader->max_reg is
+ * raised to 'num' when needed, and every register newly covered by it is
+ * marked as not yet written (write_idx = -1).
+ *
+ * The swizzle string is duplicated with ir2_strdup().
+ */
+struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr,
                 int num, const char *swizzle, int flags)
 {
-       struct ir2_register *reg =
-                       ir2_alloc(instr->shader, sizeof(struct ir2_register));
-       DEBUG_MSG("%x, %d, %s", flags, num, swizzle);
-       assert(num <= REG_MASK);
+       if (!(flags & IR2_REG_EXPORT)) {
+               /* num indexes shader->reg[], which has REG_MASK+1 entries */
+               assert(num >= 0 && num <= REG_MASK);
+
+               struct ir2_register *reg = &instr->shader->reg[num];
+
+               /* signed index: avoids an int/unsigned comparison with num */
+               int i;
+               for (i = instr->shader->max_reg + 1; i <= num; i++)
+                       instr->shader->reg[i].write_idx = -1;
+               instr->shader->max_reg = i - 1;
+
+               if (reg->write_idx < 0)
+                       reg->write_idx = instr->idx;
+               reg->write_idx2 = instr->idx;
+       }
+
+       struct ir2_dst_register *reg = &instr->dst_reg;
        reg->flags = flags;
        reg->num = num;
        reg->swizzle = ir2_strdup(instr->shader, swizzle);
-       assert(instr->regs_count < ARRAY_SIZE(instr->regs));
-       instr->regs[instr->regs_count++] = reg;
        return reg;
 }
 
-static void reg_update_stats(struct ir2_register *reg,
-               struct ir2_shader_info *info, bool dest)
+/*
+ * Append a source register to 'instr' and return a pointer to it (the next
+ * free slot of instr->src_reg[]).
+ *
+ * For sources without IR2_REG_CONST this also records the read in
+ * shader->reg[num] (read_idx = index of this instruction) and raises
+ * shader->max_reg to 'num' when needed, marking every register newly
+ * covered by it as not yet written (write_idx = -1).
+ *
+ * The swizzle string is duplicated with ir2_strdup().
+ */
+struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr,
+               int num, const char *swizzle, int flags)
 {
-       if (!(reg->flags & (IR2_REG_CONST|IR2_REG_EXPORT))) {
-               info->max_reg = MAX2(info->max_reg, reg->num);
-
-               if (dest) {
-                       info->regs_written |= (1 << reg->num);
-               } else if (!(info->regs_written & (1 << reg->num))) {
-                       /* for registers that haven't been written, they must be an
-                        * input register that the thread scheduler (presumably?)
-                        * needs to know about:
-                        */
-                       info->max_input_reg = MAX2(info->max_input_reg, reg->num);
-               }
+       assert(instr->src_reg_count + 1 <= ARRAY_SIZE(instr->src_reg));
+       if (!(flags & IR2_REG_CONST)) {
+               /* num indexes shader->reg[], which has REG_MASK+1 entries */
+               assert(num >= 0 && num <= REG_MASK);
+
+               struct ir2_register *reg = &instr->shader->reg[num];
+
+               reg->read_idx = instr->idx;
+
+               /* signed index: avoids an int/unsigned comparison with num */
+               int i;
+               for (i = instr->shader->max_reg + 1; i <= num; i++)
+                       instr->shader->reg[i].write_idx = -1;
+               instr->shader->max_reg = i - 1;
        }
+
+       struct ir2_src_register *reg = &instr->src_reg[instr->src_reg_count++];
+       reg->flags = flags;
+       reg->num = num;
+       reg->swizzle = ir2_strdup(instr->shader, swizzle);
+       return reg;
 }
 
-static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n)
+static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n)
 {
        uint32_t swiz = 0;
        int i;
 
-       assert(reg->flags == 0);
+       assert((reg->flags & ~IR2_REG_INPUT) == 0);
        assert(reg->swizzle);
 
        DEBUG_MSG("fetch src R%d.%s", reg->num, reg->swizzle);
@@ -547,7 +720,7 @@ static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n)
        return swiz;
 }
 
-static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg)
+static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg)
 {
        uint32_t swiz = 0;
        int i;
@@ -580,7 +753,7 @@ static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg)
 }
 
 /* actually, a write-mask */
-static uint32_t reg_alu_dst_swiz(struct ir2_register *reg)
+static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg)
 {
        uint32_t swiz = 0;
        int i;
@@ -607,12 +780,11 @@ static uint32_t reg_alu_dst_swiz(struct ir2_register *reg)
        return swiz;
 }
 
-static uint32_t reg_alu_src_swiz(struct ir2_register *reg)
+static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg)
 {
        uint32_t swiz = 0;
        int i;
 
-       assert((reg->flags & IR2_REG_EXPORT) == 0);
        assert(!reg->swizzle || (strlen(reg->swizzle) == 4));
 
        DEBUG_MSG("vector src R%d.%s", reg->num, reg->swizzle);
index c4b6c18e24c469af2736c1a44a2ee42d178bcc8a..ac2931266d4df47b91fd5266e30a240245087ad4 100644 (file)
 
 struct ir2_shader;
 
+/* Highest valid index into struct ir2_shader::reg[], which is declared
+ * with REG_MASK+1 entries.
+ */
+#define REG_MASK 0xff
+
 struct ir2_shader_info {
        uint16_t sizedwords;
        int8_t   max_reg;   /* highest GPR # used by shader */
-       uint8_t  max_input_reg;
-       uint64_t regs_written;
 };
 
 struct ir2_register {
+       /* Instruction indices (ir2_instruction::idx) tracked per register:
+        *  write_idx  - first instruction writing the register; negative
+        *               while the register has not been written
+        *  write_idx2 - most recent instruction writing it
+        *  read_idx   - most recent instruction reading it
+        * reg is the register number placed into the emitted ALU source
+        * fields for this register (presumably the result of register
+        * allocation - TODO confirm).
+        */
+       int16_t write_idx, write_idx2, read_idx, reg;
+       /* bitmask of variables on which this one depends
+        * XXX: use bitmask util?
+        */
+       uint32_t regmask[REG_MASK/32+1];
+};
+
+struct ir2_src_register {
        enum {
-               IR2_REG_CONST  = 0x1,
-               IR2_REG_EXPORT = 0x2,
+               IR2_REG_INPUT  = 0x1,
+               IR2_REG_CONST  = 0x2,
                IR2_REG_NEGATE = 0x4,
                IR2_REG_ABS    = 0x8,
        } flags;
@@ -51,6 +59,14 @@ struct ir2_register {
        char *swizzle;
 };
 
+/* Destination operand of an instruction (see ir2_instruction::dst_reg). */
+struct ir2_dst_register {
+       enum {
+               /* when set, ir2_dst_create() skips the shader->reg[]
+                * bookkeeping for this destination
+                */
+               IR2_REG_EXPORT = 0x1,
+       } flags;
+       int num;
+       /* copy of the swizzle string passed to ir2_dst_create() */
+       char *swizzle;
+};
+
 enum ir2_pred {
        IR2_PRED_NONE = 0,
        IR2_PRED_EQ = 1,
@@ -59,14 +75,17 @@ enum ir2_pred {
 
 struct ir2_instruction {
        struct ir2_shader *shader;
+       unsigned idx;
        enum {
                IR2_FETCH,
-               IR2_ALU,
+               IR2_ALU_VECTOR,
+               IR2_ALU_SCALAR,
        } instr_type;
        enum ir2_pred pred;
        int sync;
-       unsigned regs_count;
-       struct ir2_register *regs[5];
+       unsigned src_reg_count;
+       struct ir2_dst_register dst_reg;
+       struct ir2_src_register src_reg[3];
        union {
                /* FETCH specific: */
                struct {
@@ -83,38 +102,25 @@ struct ir2_instruction {
                        uint32_t stride;
                        uint32_t offset;
                } fetch;
-               /* ALU specific: */
+               /* ALU-Vector specific: */
                struct {
-                       instr_vector_opc_t vector_opc;
-                       instr_scalar_opc_t scalar_opc;
-                       bool vector_clamp : 1;
-                       bool scalar_clamp : 1;
-               } alu;
-       };
-};
-
-struct ir2_cf {
-       struct ir2_shader *shader;
-       instr_cf_opc_t cf_type;
-
-       union {
-               /* EXEC/EXEC_END specific: */
-               struct {
-                       unsigned instrs_count;
-                       struct ir2_instruction *instrs[6];
-                       uint32_t addr, cnt, sequence;
-               } exec;
-               /* ALLOC specific: */
+                       instr_vector_opc_t opc;
+                       bool clamp;
+               } alu_vector;
+               /* ALU-Scalar specific: */
                struct {
-                       instr_alloc_type_t type;   /* SQ_POSITION or SQ_PARAMETER_PIXEL */
-                       int size;
-               } alloc;
+                       instr_scalar_opc_t opc;
+                       bool clamp;
+               } alu_scalar;
        };
 };
 
 struct ir2_shader {
-       unsigned cfs_count;
-       struct ir2_cf *cfs[0x56];
+       unsigned instr_count;
+       int max_reg;
+       struct ir2_register reg[REG_MASK+1];
+
+       struct ir2_instruction *instr[0x200];
        uint32_t heap[100 * 4096];
        unsigned heap_idx;
 
@@ -126,40 +132,41 @@ void ir2_shader_destroy(struct ir2_shader *shader);
 void * ir2_shader_assemble(struct ir2_shader *shader,
                struct ir2_shader_info *info);
 
-struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type);
-
-struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type);
+struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
+               int instr_type);
 
-struct ir2_register * ir2_reg_create(struct ir2_instruction *instr,
+/* Fill in the destination register of an instruction and return it. */
+struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr,
+               int num, const char *swizzle, int flags);
+/* Append a source register to an instruction and return it. */
+struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr,
                 int num, const char *swizzle, int flags);
 
 /* some helper fxns: */
 
-static inline struct ir2_cf *
-ir2_cf_create_alloc(struct ir2_shader *shader, instr_alloc_type_t type, int size)
+/* Create an IR2_ALU_VECTOR instruction in 'shader' with vector opcode
+ * 'vop'.  A NULL result from ir2_instr_create() is returned unchanged.
+ */
+static inline struct ir2_instruction *
+ir2_instr_create_alu_v(struct ir2_shader *shader, instr_vector_opc_t vop)
 {
-       struct ir2_cf *cf = ir2_cf_create(shader, ALLOC);
-       if (!cf)
-               return cf;
-       cf->alloc.type = type;
-       cf->alloc.size = size;
-       return cf;
+       struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_VECTOR);
+       if (!instr)
+               return instr;
+       instr->alu_vector.opc = vop;
+       return instr;
 }
+
+/* Create an IR2_ALU_SCALAR instruction in 'shader' with scalar opcode
+ * 'sop'.  A NULL result from ir2_instr_create() is returned unchanged.
+ */
 static inline struct ir2_instruction *
-ir2_instr_create_alu(struct ir2_cf *cf, instr_vector_opc_t vop, instr_scalar_opc_t sop)
+ir2_instr_create_alu_s(struct ir2_shader *shader, instr_scalar_opc_t sop)
 {
-       struct ir2_instruction *instr = ir2_instr_create(cf, IR2_ALU);
+       struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_SCALAR);
        if (!instr)
                return instr;
-       instr->alu.vector_opc = vop;
-       instr->alu.scalar_opc = sop;
+       instr->alu_scalar.opc = sop;
        return instr;
 }
+
 static inline struct ir2_instruction *
-ir2_instr_create_vtx_fetch(struct ir2_cf *cf, int ci, int cis,
+ir2_instr_create_vtx_fetch(struct ir2_shader *shader, int ci, int cis,
                enum a2xx_sq_surfaceformat fmt, bool is_signed, int stride)
 {
-       struct ir2_instruction *instr = ir2_instr_create(cf, IR2_FETCH);
+       struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH);
        instr->fetch.opc = VTX_FETCH;
        instr->fetch.const_idx = ci;
        instr->fetch.const_idx_sel = cis;
@@ -169,9 +176,9 @@ ir2_instr_create_vtx_fetch(struct ir2_cf *cf, int ci, int cis,
        return instr;
 }
 static inline struct ir2_instruction *
-ir2_instr_create_tex_fetch(struct ir2_cf *cf, int ci)
+ir2_instr_create_tex_fetch(struct ir2_shader *shader, int ci)
 {
-       struct ir2_instruction *instr = ir2_instr_create(cf, IR2_FETCH);
+       struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH);
        instr->fetch.opc = TEX_FETCH;
        instr->fetch.const_idx = ci;
        return instr;