r600g: honour semantic index in fragment color exports
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
index 6dbca505cbe772528f6d998070a344c1422f1b41..37298ccdcf21655e36566cb1a4377589d617d292 100644 (file)
 #include "r600_shader.h"
 #include "r600d.h"
 
+#include "sb/sb_public.h"
+
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_scan.h"
 #include "tgsi/tgsi_dump.h"
 #include "util/u_memory.h"
+#include "util/u_math.h"
 #include <stdio.h>
 #include <errno.h>
-#include <byteswap.h>
 
 /* CAYMAN notes 
 Why CAYMAN got loops for lots of instructions is explained here.
@@ -62,6 +64,26 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                                 struct r600_pipe_shader *pipeshader,
                                 struct r600_shader_key key);
 
+static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
+                           int size, unsigned comp_mask) {
+       /* Append one entry (gpr_start = start_gpr, gpr_count = size, comp_mask) to ps->arrays; does nothing when size is 0. */
+       if (!size)
+               return;
+       /* Table full: raise max_arrays by 64 and grow the storage. NOTE(review): the realloc result is not checked for NULL. */
+       if (ps->num_arrays == ps->max_arrays) {
+               ps->max_arrays += 64;
+               ps->arrays = realloc(ps->arrays, ps->max_arrays *
+                                    sizeof(struct r600_shader_array));
+       }
+       /* Take the next free slot and bump the entry count. */
+       int n = ps->num_arrays;
+       ++ps->num_arrays;
+       /* Fill in the new entry from the arguments. */
+       ps->arrays[n].comp_mask = comp_mask;
+       ps->arrays[n].gpr_start = start_gpr;
+       ps->arrays[n].gpr_count = size;
+}
+
 static unsigned tgsi_get_processor_type(const struct tgsi_token *tokens)
 {
        struct tgsi_parse_context parse;
@@ -118,6 +140,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
        int r, i;
        uint32_t *ptr;
        bool dump = r600_can_dump_shader(rctx->screen, tgsi_get_processor_type(sel->tokens));
+       unsigned use_sb = rctx->screen->debug_flags & DBG_SB;
+       unsigned sb_disasm = use_sb || (rctx->screen->debug_flags & DBG_SB_DISASM);
 
        shader->shader.bc.isa = rctx->isa;
 
@@ -134,18 +158,32 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
                R600_ERR("translation from TGSI failed !\n");
                return r;
        }
-       r = r600_bytecode_build(&shader->shader.bc);
-       if (r) {
-               R600_ERR("building bytecode failed !\n");
-               return r;
+
+       /* Check if the bytecode has already been built.  When using the llvm
+        * backend, r600_shader_from_tgsi() will take care of building the
+        * bytecode.
+        */
+       if (!shader->shader.bc.bytecode) {
+               r = r600_bytecode_build(&shader->shader.bc);
+               if (r) {
+                       R600_ERR("building bytecode failed !\n");
+                       return r;
+               }
        }
-       if (dump) {
+
+       if (dump && !sb_disasm) {
                fprintf(stderr, "--------------------------------------------------------------\n");
                r600_bytecode_disasm(&shader->shader.bc);
                fprintf(stderr, "______________________________________________________________\n");
+       } else if ((dump && sb_disasm) || use_sb) {
+               r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
+                                            dump, use_sb);
+               if (r) {
+                       R600_ERR("r600_sb_bytecode_process failed !\n");
+                       return r;
+               }
        }
 
-
        /* Store the shader in a buffer. */
        if (shader->bo == NULL) {
                shader->bo = (struct r600_resource*)
@@ -156,7 +194,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
                ptr = r600_buffer_mmap_sync_with_rings(rctx, shader->bo, PIPE_TRANSFER_WRITE);
                if (R600_BIG_ENDIAN) {
                        for (i = 0; i < shader->shader.bc.ndw; ++i) {
-                               ptr[i] = bswap_32(shader->shader.bc.bytecode[i]);
+                               ptr[i] = util_bswap32(shader->shader.bc.bytecode[i]);
                        }
                } else {
                        memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
@@ -253,401 +291,6 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
 static int tgsi_endloop(struct r600_shader_ctx *ctx);
 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
 
-/*
- * bytestream -> r600 shader
- *
- * These functions are used to transform the output of the LLVM backend into
- * struct r600_bytecode.
- */
-
-static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
-                               unsigned char * bytes,  unsigned num_bytes);
-
-#ifdef HAVE_OPENCL
-int r600_compute_shader_create(struct pipe_context * ctx,
-       LLVMModuleRef mod,  struct r600_bytecode * bytecode)
-{
-       struct r600_context *r600_ctx = (struct r600_context *)ctx;
-       unsigned char * bytes;
-       unsigned byte_count;
-       struct r600_shader_ctx shader_ctx;
-       bool dump = (r600_ctx->screen->debug_flags & DBG_CS) != 0;
-
-       r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
-       shader_ctx.bc = bytecode;
-       r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family,
-                          r600_ctx->screen->msaa_texture_support);
-       shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
-       shader_ctx.bc->isa = r600_ctx->isa;
-       r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
-       if (shader_ctx.bc->chip_class == CAYMAN) {
-               cm_bytecode_add_cf_end(shader_ctx.bc);
-       }
-       r600_bytecode_build(shader_ctx.bc);
-       if (dump) {
-               r600_bytecode_disasm(shader_ctx.bc);
-       }
-       free(bytes);
-       return 1;
-}
-
-#endif /* HAVE_OPENCL */
-
-static uint32_t i32_from_byte_stream(unsigned char * bytes,
-               unsigned * bytes_read)
-{
-       unsigned i;
-       uint32_t out = 0;
-       for (i = 0; i < 4; i++) {
-               out |= bytes[(*bytes_read)++] << (8 * i);
-       }
-       return out;
-}
-
-static unsigned r600_src_from_byte_stream(unsigned char * bytes,
-               unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
-{
-       unsigned i;
-       unsigned sel0, sel1;
-       sel0 = bytes[bytes_read++];
-       sel1 = bytes[bytes_read++];
-       alu->src[src_idx].sel = sel0 | (sel1 << 8);
-       alu->src[src_idx].chan = bytes[bytes_read++];
-       alu->src[src_idx].neg = bytes[bytes_read++];
-       alu->src[src_idx].abs = bytes[bytes_read++];
-       alu->src[src_idx].rel = bytes[bytes_read++];
-       alu->src[src_idx].kc_bank = bytes[bytes_read++];
-       for (i = 0; i < 4; i++) {
-               alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
-       }
-       return bytes_read;
-}
-
-static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
-                               unsigned char * bytes, unsigned bytes_read)
-{
-       unsigned src_idx, src_num;
-       struct r600_bytecode_alu alu;
-       unsigned src_use_sel[3];
-       const struct alu_op_info *alu_op;
-       unsigned src_sel[3] = {};
-       uint32_t word0, word1;
-
-       src_num = bytes[bytes_read++];
-
-       memset(&alu, 0, sizeof(alu));
-       for(src_idx = 0; src_idx < src_num; src_idx++) {
-               unsigned i;
-               src_use_sel[src_idx] = bytes[bytes_read++];
-               for (i = 0; i < 4; i++) {
-                       src_sel[src_idx] |= bytes[bytes_read++] << (i * 8);
-               }
-               for (i = 0; i < 4; i++) {
-                       alu.src[src_idx].value |= bytes[bytes_read++] << (i * 8);
-               }
-       }
-
-       word0 = i32_from_byte_stream(bytes, &bytes_read);
-       word1 = i32_from_byte_stream(bytes, &bytes_read);
-
-       switch(ctx->bc->chip_class) {
-       default:
-       case R600:
-               r600_bytecode_alu_read(ctx->bc, &alu, word0, word1);
-               break;
-       case R700:
-       case EVERGREEN:
-       case CAYMAN:
-               r700_bytecode_alu_read(ctx->bc, &alu, word0, word1);
-               break;
-       }
-
-       for(src_idx = 0; src_idx < src_num; src_idx++) {
-               if (src_use_sel[src_idx]) {
-                       unsigned sel = src_sel[src_idx];
-
-                       alu.src[src_idx].chan = sel & 3;
-                       sel >>= 2;
-
-                       if (sel>=512) { /* constant */
-                               sel -= 512;
-                               alu.src[src_idx].kc_bank = sel >> 12;
-                               alu.src[src_idx].sel = (sel & 4095) + 512;
-                       }
-                       else {
-                               alu.src[src_idx].sel = sel;
-                       }
-               }
-       }
-
-       alu_op = r600_isa_alu(alu.op);
-
-#if HAVE_LLVM < 0x0302
-       if ((alu_op->flags & AF_PRED) && alu_op->src_count == 2) {
-               alu.update_pred = 1;
-               alu.dst.write = 0;
-               alu.src[1].sel = V_SQ_ALU_SRC_0;
-               alu.src[1].chan = 0;
-               alu.last = 1;
-       }
-#endif
-
-       if (alu_op->flags & AF_MOVA) {
-               ctx->bc->ar_reg = alu.src[0].sel;
-               ctx->bc->ar_chan = alu.src[0].chan;
-               ctx->bc->ar_loaded = 0;
-               return bytes_read;
-       }
-
-       r600_bytecode_add_alu_type(ctx->bc, &alu, ctx->bc->cf_last->op);
-
-       /* XXX: Handle other KILL instructions */
-       if (alu_op->flags & AF_KILL) {
-               ctx->shader->uses_kill = 1;
-               /* XXX: This should be enforced in the LLVM backend. */
-               ctx->bc->force_add_cf = 1;
-       }
-       return bytes_read;
-}
-
-static void llvm_if(struct r600_shader_ctx *ctx)
-{
-       r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
-       fc_pushlevel(ctx, FC_IF);
-       callstack_push(ctx, FC_PUSH_VPM);
-}
-
-static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
-{
-       unsigned opcode = TGSI_OPCODE_BRK;
-       if (ctx->bc->chip_class == CAYMAN)
-               ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
-       else if (ctx->bc->chip_class >= EVERGREEN)
-               ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
-       else
-               ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
-       llvm_if(ctx);
-       tgsi_loop_brk_cont(ctx);
-       tgsi_endif(ctx);
-}
-
-static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
-                               unsigned char * bytes, unsigned bytes_read)
-{
-       struct r600_bytecode_alu alu;
-       unsigned inst;
-       memset(&alu, 0, sizeof(alu));
-       bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
-       inst = bytes[bytes_read++];
-       switch (inst) {
-       case 0: /* IF_PREDICATED */
-               llvm_if(ctx);
-               break;
-       case 1: /* ELSE */
-               tgsi_else(ctx);
-               break;
-       case 2: /* ENDIF */
-               tgsi_endif(ctx);
-               break;
-       case 3: /* BGNLOOP */
-               tgsi_bgnloop(ctx);
-               break;
-       case 4: /* ENDLOOP */
-               tgsi_endloop(ctx);
-               break;
-       case 5: /* PREDICATED_BREAK */
-               r600_break_from_byte_stream(ctx);
-               break;
-       case 6: /* CONTINUE */
-               {
-                       unsigned opcode = TGSI_OPCODE_CONT;
-                       if (ctx->bc->chip_class == CAYMAN) {
-                               ctx->inst_info =
-                                       &cm_shader_tgsi_instruction[opcode];
-                       } else if (ctx->bc->chip_class >= EVERGREEN) {
-                               ctx->inst_info =
-                                       &eg_shader_tgsi_instruction[opcode];
-                       } else {
-                               ctx->inst_info =
-                                       &r600_shader_tgsi_instruction[opcode];
-                       }
-                       tgsi_loop_brk_cont(ctx);
-               }
-               break;
-       }
-
-       return bytes_read;
-}
-
-static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
-                               unsigned char * bytes, unsigned bytes_read)
-{
-       struct r600_bytecode_tex tex;
-
-       uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
-       uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
-       uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
-
-       tex.op = r600_isa_fetch_by_opcode(ctx->bc->isa, G_SQ_TEX_WORD0_TEX_INST(word0));
-       tex.resource_id = G_SQ_TEX_WORD0_RESOURCE_ID(word0);
-       tex.src_gpr = G_SQ_TEX_WORD0_SRC_GPR(word0);
-       tex.src_rel = G_SQ_TEX_WORD0_SRC_REL(word0);
-       tex.dst_gpr = G_SQ_TEX_WORD1_DST_GPR(word1);
-       tex.dst_rel = G_SQ_TEX_WORD1_DST_REL(word1);
-       tex.dst_sel_x = G_SQ_TEX_WORD1_DST_SEL_X(word1);
-       tex.dst_sel_y = G_SQ_TEX_WORD1_DST_SEL_Y(word1);
-       tex.dst_sel_z = G_SQ_TEX_WORD1_DST_SEL_Z(word1);
-       tex.dst_sel_w = G_SQ_TEX_WORD1_DST_SEL_W(word1);
-       tex.lod_bias = G_SQ_TEX_WORD1_LOD_BIAS(word1);
-       tex.coord_type_x = G_SQ_TEX_WORD1_COORD_TYPE_X(word1);
-       tex.coord_type_y = G_SQ_TEX_WORD1_COORD_TYPE_Y(word1);
-       tex.coord_type_z = G_SQ_TEX_WORD1_COORD_TYPE_Z(word1);
-       tex.coord_type_w = G_SQ_TEX_WORD1_COORD_TYPE_W(word1);
-       tex.offset_x = G_SQ_TEX_WORD2_OFFSET_X(word2);
-       tex.offset_y = G_SQ_TEX_WORD2_OFFSET_Y(word2);
-       tex.offset_z = G_SQ_TEX_WORD2_OFFSET_Z(word2);
-       tex.sampler_id = G_SQ_TEX_WORD2_SAMPLER_ID(word2);
-       tex.src_sel_x = G_SQ_TEX_WORD2_SRC_SEL_X(word2);
-       tex.src_sel_y = G_SQ_TEX_WORD2_SRC_SEL_Y(word2);
-       tex.src_sel_z = G_SQ_TEX_WORD2_SRC_SEL_Z(word2);
-       tex.src_sel_w = G_SQ_TEX_WORD2_SRC_SEL_W(word2);
-       tex.offset_x <<= 1;
-       tex.offset_y <<= 1;
-       tex.offset_z <<= 1;
-
-       tex.inst_mod = 0;
-
-       r600_bytecode_add_tex(ctx->bc, &tex);
-
-       return bytes_read;
-}
-
-static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
-       unsigned char * bytes, unsigned bytes_read)
-{
-       struct r600_bytecode_vtx vtx;
-
-       uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
-        uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
-       uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
-
-       memset(&vtx, 0, sizeof(vtx));
-
-       /* WORD0 */
-       vtx.op = r600_isa_fetch_by_opcode(ctx->bc->isa,
-                       G_SQ_VTX_WORD0_VTX_INST(word0));
-       vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
-       vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
-       vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
-       vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
-       vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
-
-       /* WORD1 */
-       vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
-       vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
-       vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
-       vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
-       vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
-       vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
-       vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
-       vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
-       vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
-       vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
-
-       /* WORD 2*/
-       vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
-       vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
-
-       if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
-               fprintf(stderr, "Error adding vtx\n");
-       }
-
-       /* Use the Texture Cache for compute shaders*/
-       if (ctx->bc->chip_class >= EVERGREEN &&
-               ctx->bc->type == TGSI_PROCESSOR_COMPUTE) {
-               ctx->bc->cf_last->op = CF_OP_TEX;
-       }
-       return bytes_read;
-}
-
-static int r600_export_from_byte_stream(struct r600_shader_ctx *ctx,
-       unsigned char * bytes, unsigned bytes_read)
-{
-       uint32_t word0 = 0, word1 = 0;
-       struct r600_bytecode_output output;
-       memset(&output, 0, sizeof(struct r600_bytecode_output));
-       word0 = i32_from_byte_stream(bytes, &bytes_read);
-       word1 = i32_from_byte_stream(bytes, &bytes_read);
-       if (ctx->bc->chip_class >= EVERGREEN)
-               eg_bytecode_export_read(ctx->bc, &output, word0,word1);
-       else
-               r600_bytecode_export_read(ctx->bc, &output, word0,word1);
-       r600_bytecode_add_output(ctx->bc, &output);
-       return bytes_read;
-}
-
-static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
-                               unsigned char * bytes,  unsigned num_bytes)
-{
-       unsigned bytes_read = 0;
-       ctx->bc->nstack = bytes[bytes_read++];
-       unsigned i, byte;
-       while (bytes_read < num_bytes) {
-               char inst_type = bytes[bytes_read++];
-               switch (inst_type) {
-               case 0:
-                       bytes_read = r600_alu_from_byte_stream(ctx, bytes,
-                                                               bytes_read);
-                       break;
-               case 1:
-                       bytes_read = r600_tex_from_byte_stream(ctx, bytes,
-                                                               bytes_read);
-                       break;
-               case 2:
-                       bytes_read = r600_fc_from_byte_stream(ctx, bytes,
-                                                               bytes_read);
-                       break;
-               case 3:
-                       r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
-                       for (i = 0; i < 2; i++) {
-                               for (byte = 0 ; byte < 4; byte++) {
-                                       ctx->bc->cf_last->isa[i] |=
-                                       (bytes[bytes_read++] << (byte * 8));
-                               }
-                       }
-                       break;
-
-               case 4:
-                       bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
-                                                               bytes_read);
-                       break;
-               case 5:
-            bytes_read = r600_export_from_byte_stream(ctx, bytes,
-                                bytes_read);
-            break;
-               case 6: {
-                       int32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
-                       int32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
-
-                       r600_bytecode_add_cf(ctx->bc);
-                       ctx->bc->cf_last->op = r600_isa_cf_by_opcode(ctx->bc->isa, G_SQ_CF_ALU_WORD1_CF_INST(word1), 1);
-                       ctx->bc->cf_last->kcache[0].bank = G_SQ_CF_ALU_WORD0_KCACHE_BANK0(word0);
-                       ctx->bc->cf_last->kcache[0].addr = G_SQ_CF_ALU_WORD1_KCACHE_ADDR0(word1);
-                       ctx->bc->cf_last->kcache[0].mode = G_SQ_CF_ALU_WORD0_KCACHE_MODE0(word0);
-                       ctx->bc->cf_last->kcache[1].bank = G_SQ_CF_ALU_WORD0_KCACHE_BANK1(word0);
-                       ctx->bc->cf_last->kcache[1].addr = G_SQ_CF_ALU_WORD1_KCACHE_ADDR1(word1);
-                       ctx->bc->cf_last->kcache[1].mode = G_SQ_CF_ALU_WORD1_KCACHE_MODE1(word1);
-                       break;
-      }
-               default:
-                       /* XXX: Error here */
-                       break;
-               }
-       }
-}
-
-/* End bytestream -> r600 shader functions*/
-
 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 {
        struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
@@ -896,6 +539,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
        switch (d->Declaration.File) {
        case TGSI_FILE_INPUT:
                i = ctx->shader->ninput;
+                assert(i < Elements(ctx->shader->input));
                ctx->shader->ninput += count;
                ctx->shader->input[i].name = d->Semantic.Name;
                ctx->shader->input[i].sid = d->Semantic.Index;
@@ -927,6 +571,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
                break;
        case TGSI_FILE_OUTPUT:
                i = ctx->shader->noutput++;
+                assert(i < Elements(ctx->shader->output));
                ctx->shader->output[i].name = d->Semantic.Name;
                ctx->shader->output[i].sid = d->Semantic.Index;
                ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
@@ -955,8 +600,18 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
                        }
                }
                break;
-       case TGSI_FILE_CONSTANT:
        case TGSI_FILE_TEMPORARY:
+               if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+                       if (d->Array.ArrayID) {
+                               r600_add_gpr_array(ctx->shader,
+                                              ctx->file_offset[TGSI_FILE_TEMPORARY] +
+                                                                  d->Range.First,
+                                              d->Range.Last - d->Range.First + 1, 0x0F);
+                       }
+               }
+               break;
+
+       case TGSI_FILE_CONSTANT:
        case TGSI_FILE_SAMPLER:
        case TGSI_FILE_ADDRESS:
                break;
@@ -1247,6 +902,7 @@ static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
        return 0;
 }
 
+
 static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                                 struct r600_pipe_shader *pipeshader,
                                 struct r600_shader_key key)
@@ -1261,11 +917,10 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
        unsigned output_done, noutput;
        unsigned opcode;
        int i, j, k, r = 0;
-       int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
+       int next_pos_base = 60, next_param_base = 0;
        /* Declarations used by llvm code */
        bool use_llvm = false;
-       unsigned char * inst_bytes = NULL;
-       unsigned inst_byte_count = 0;
+       bool indirect_gprs;
 
 #ifdef R600_USE_LLVM
        use_llvm = !(rscreen->debug_flags & DBG_NO_LLVM);
@@ -1275,9 +930,11 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
        ctx.native_integers = true;
 
        r600_bytecode_init(ctx.bc, rscreen->chip_class, rscreen->family,
-                          rscreen->msaa_texture_support);
+                          rscreen->has_compressed_msaa_texturing);
        ctx.tokens = tokens;
        tgsi_scan_shader(tokens, &ctx.info);
+       shader->indirect_files = ctx.info.indirect_files;
+       indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
        tgsi_parse_init(&ctx.parse, tokens);
        ctx.type = ctx.parse.FullHeader.Processor.Processor;
        shader->processor_type = ctx.type;
@@ -1317,13 +974,6 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
        for (i = 0; i < TGSI_FILE_COUNT; i++) {
                ctx.file_offset[i] = 0;
        }
-       if (ctx.type == TGSI_PROCESSOR_VERTEX) {
-               ctx.file_offset[TGSI_FILE_INPUT] = 1;
-               r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
-       }
-       if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
-               ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
-       }
 
 #ifdef R600_USE_LLVM
        if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
@@ -1333,6 +983,15 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                use_llvm = 0;
        }
 #endif
+       if (ctx.type == TGSI_PROCESSOR_VERTEX) {
+               ctx.file_offset[TGSI_FILE_INPUT] = 1;
+               if (!use_llvm) {
+                       r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
+               }
+       }
+       if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
+               ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
+       }
        ctx.use_llvm = use_llvm;
 
        if (use_llvm) {
@@ -1355,6 +1014,24 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                        ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
        ctx.temp_reg = ctx.bc->ar_reg + 1;
 
+       if (indirect_gprs) {
+               shader->max_arrays = 0;
+               shader->num_arrays = 0;
+
+               if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
+                       r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
+                                          ctx.file_offset[TGSI_FILE_OUTPUT] -
+                                          ctx.file_offset[TGSI_FILE_INPUT],
+                                          0x0F);
+               }
+               if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
+                       r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
+                                          ctx.file_offset[TGSI_FILE_TEMPORARY] -
+                                          ctx.file_offset[TGSI_FILE_OUTPUT],
+                                          0x0F);
+               }
+       }
+
        ctx.nliterals = 0;
        ctx.literals = NULL;
        shader->fs_write_all = FALSE;
@@ -1445,6 +1122,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                struct radeon_llvm_context radeon_llvm_ctx;
                LLVMModuleRef mod;
                bool dump = r600_can_dump_shader(rscreen, ctx.type);
+               boolean use_kill = false;
 
                memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
                radeon_llvm_ctx.type = ctx.type;
@@ -1459,10 +1137,9 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                radeon_llvm_ctx.clip_vertex = ctx.cv_output;
                radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
                mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
+               ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
 
-               if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
-                                     rscreen->family, dump)) {
-                       FREE(inst_bytes);
+               if (r600_llvm_compile(mod, rscreen->family, ctx.bc, &use_kill, dump)) {
                        radeon_llvm_dispose(&radeon_llvm_ctx);
                        use_llvm = 0;
                        fprintf(stderr, "R600 LLVM backend failed to compile "
@@ -1471,6 +1148,8 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                        ctx.file_offset[TGSI_FILE_OUTPUT] =
                                        ctx.file_offset[TGSI_FILE_INPUT];
                }
+               if (use_kill)
+                       ctx.shader->uses_kill = use_kill;
                radeon_llvm_dispose(&radeon_llvm_ctx);
        }
 #endif
@@ -1553,12 +1232,6 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
        /* Reset the temporary register counter. */
        ctx.max_driver_temp_used = 0;
 
-       /* Get instructions if we are using the LLVM backend. */
-       if (use_llvm) {
-               r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
-               FREE(inst_bytes);
-       }
-
        noutput = shader->noutput;
 
        if (ctx.clip_vertex_write) {
@@ -1767,13 +1440,13 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                case TGSI_PROCESSOR_FRAGMENT:
                        if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
                                /* never export more colors than the number of CBs */
-                               if (next_pixel_base && next_pixel_base >= key.nr_cbufs) {
+                               if (shader->output[i].sid >= key.nr_cbufs) {
                                        /* skip export */
                                        j--;
                                        continue;
                                }
                                output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
-                               output[j].array_base = next_pixel_base++;
+                               output[j].array_base = shader->output[i].sid;
                                output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
                                shader->nr_ps_color_exports++;
                                if (shader->fs_write_all && (rscreen->chip_class >= EVERGREEN)) {
@@ -1788,7 +1461,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                                                output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
                                                output[j].burst_count = 1;
                                                output[j].barrier = 1;
-                                               output[j].array_base = next_pixel_base++;
+                                               output[j].array_base = k;
                                                output[j].op = CF_OP_EXPORT;
                                                output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
                                                shader->nr_ps_color_exports++;
@@ -1859,7 +1532,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
        }
 
        /* add fake pixel export */
-       if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
+       if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
                memset(&output[j], 0, sizeof(struct r600_bytecode_output));
                output[j].gpr = 0;
                output[j].elem_size = 3;
@@ -1898,7 +1571,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen,
                }
        }
        /* add program end */
-       if (ctx.bc->chip_class == CAYMAN)
+       if (!use_llvm && ctx.bc->chip_class == CAYMAN)
                cm_bytecode_add_cf_end(ctx.bc);
 
        /* check GPR limit - we have 124 = 128 - 4
@@ -2420,7 +2093,7 @@ static int tgsi_kill(struct r600_shader_ctx *ctx)
 
                alu.src[0].sel = V_SQ_ALU_SRC_0;
 
-               if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
+               if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILL) {
                        alu.src[1].sel = V_SQ_ALU_SRC_1;
                        alu.src[1].neg = 1;
                } else {
@@ -4090,10 +3763,11 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
        unsigned src_gpr;
        int r, i, j;
        int opcode;
-       bool read_compressed_msaa = ctx->bc->msaa_texture_mode == MSAA_TEXTURE_COMPRESSED &&
+       bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
                                    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
                                    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
                                     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
+
        /* Texture fetch instructions can only use gprs as source.
         * Also they cannot negate the source or take the absolute value */
        const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
@@ -4739,6 +4413,26 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
                /* the array index is read from Z */
                tex.coord_type_z = 0;
 
+       /* mask unused source components */
+       if (opcode == FETCH_OP_SAMPLE) {
+               switch (inst->Texture.Texture) {
+               case TGSI_TEXTURE_2D:
+               case TGSI_TEXTURE_RECT:
+                       tex.src_sel_z = 7;
+                       tex.src_sel_w = 7;
+                       break;
+               case TGSI_TEXTURE_1D_ARRAY:
+                       tex.src_sel_y = 7;
+                       tex.src_sel_w = 7;
+                       break;
+               case TGSI_TEXTURE_1D:
+                       tex.src_sel_y = 7;
+                       tex.src_sel_z = 7;
+                       tex.src_sel_w = 7;
+                       break;
+               }
+       }
+
        r = r600_bytecode_add_tex(ctx->bc, &tex);
        if (r)
                return r;
@@ -5490,7 +5184,7 @@ static int tgsi_opdst(struct r600_shader_ctx *ctx)
        return 0;
 }
 
-static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
+static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
 {
        struct r600_bytecode_alu alu;
        int r;
@@ -5510,7 +5204,7 @@ static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
 
        alu.last = 1;
 
-       r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
+       r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
        if (r)
                return r;
        return 0;
@@ -5728,9 +5422,21 @@ static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
 }
 #endif
 
-static int tgsi_if(struct r600_shader_ctx *ctx)
+static int emit_if(struct r600_shader_ctx *ctx, int opcode)
 {
-       emit_logic_pred(ctx, ALU_OP2_PRED_SETNE_INT);
+       int alu_type = CF_OP_ALU_PUSH_BEFORE;
+
+       /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
+        * LOOP_STARTxxx for nested loops may put the branch stack into a state
+        * such that ALU_PUSH_BEFORE doesn't work as expected. Work around this
+        * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
+       if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
+               r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
+               ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
+               alu_type = CF_OP_ALU;
+       }
+
+       emit_logic_pred(ctx, opcode, alu_type);
 
        r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
 
@@ -5740,6 +5446,16 @@ static int tgsi_if(struct r600_shader_ctx *ctx)
        return 0;
 }
 
+static int tgsi_if(struct r600_shader_ctx *ctx)
+{
+       return emit_if(ctx, ALU_OP2_PRED_SETNE);
+}
+
+static int tgsi_uif(struct r600_shader_ctx *ctx)
+{
+       return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
+}
+
 static int tgsi_else(struct r600_shader_ctx *ctx)
 {
        r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
@@ -5955,7 +5671,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
        {TGSI_OPCODE_COS,       0, ALU_OP1_COS, tgsi_trig},
        {TGSI_OPCODE_DDX,       0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
        {TGSI_OPCODE_DDY,       0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
-       {TGSI_OPCODE_KILP,      0, ALU_OP2_KILLGT, tgsi_kill},  /* predicated kill */
+       {TGSI_OPCODE_KILL,      0, ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
        {TGSI_OPCODE_PK2H,      0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_PK2US,     0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_PK4B,      0, ALU_OP0_NOP, tgsi_unsupported},
@@ -5991,8 +5707,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
        {TGSI_OPCODE_TXL,       0, FETCH_OP_SAMPLE_L, tgsi_tex},
        {TGSI_OPCODE_BRK,       0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
        {TGSI_OPCODE_IF,        0, ALU_OP0_NOP, tgsi_if},
-       /* gap */
-       {75,                    0, ALU_OP0_NOP, tgsi_unsupported},
+       {TGSI_OPCODE_UIF,       0, ALU_OP0_NOP, tgsi_uif},
        {76,                    0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_ELSE,      0, ALU_OP0_NOP, tgsi_else},
        {TGSI_OPCODE_ENDIF,     0, ALU_OP0_NOP, tgsi_endif},
@@ -6035,9 +5750,10 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
        {111,                   0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_NRM4,      0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_CALLNZ,    0, ALU_OP0_NOP, tgsi_unsupported},
-       {TGSI_OPCODE_IFC,       0, ALU_OP0_NOP, tgsi_unsupported},
+       /* gap */
+       {114,                   0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_BREAKC,    0, ALU_OP0_NOP, tgsi_unsupported},
-       {TGSI_OPCODE_KIL,       0, ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
+       {TGSI_OPCODE_KILL_IF,   0, ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
        {TGSI_OPCODE_END,       0, ALU_OP0_NOP, tgsi_end},  /* aka HALT */
        /* gap */
        {118,                   0, ALU_OP0_NOP, tgsi_unsupported},
@@ -6148,7 +5864,7 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
        {TGSI_OPCODE_COS,       0, ALU_OP1_COS, tgsi_trig},
        {TGSI_OPCODE_DDX,       0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
        {TGSI_OPCODE_DDY,       0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
-       {TGSI_OPCODE_KILP,      0, ALU_OP2_KILLGT, tgsi_kill},  /* predicated kill */
+       {TGSI_OPCODE_KILL,      0, ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
        {TGSI_OPCODE_PK2H,      0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_PK2US,     0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_PK4B,      0, ALU_OP0_NOP, tgsi_unsupported},
@@ -6184,8 +5900,7 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
        {TGSI_OPCODE_TXL,       0, FETCH_OP_SAMPLE_L, tgsi_tex},
        {TGSI_OPCODE_BRK,       0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
        {TGSI_OPCODE_IF,        0, ALU_OP0_NOP, tgsi_if},
-       /* gap */
-       {75,                    0, ALU_OP0_NOP, tgsi_unsupported},
+       {TGSI_OPCODE_UIF,       0, ALU_OP0_NOP, tgsi_uif},
        {76,                    0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_ELSE,      0, ALU_OP0_NOP, tgsi_else},
        {TGSI_OPCODE_ENDIF,     0, ALU_OP0_NOP, tgsi_endif},
@@ -6228,9 +5943,10 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
        {111,                   0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_NRM4,      0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_CALLNZ,    0, ALU_OP0_NOP, tgsi_unsupported},
-       {TGSI_OPCODE_IFC,       0, ALU_OP0_NOP, tgsi_unsupported},
+       /* gap */
+       {114,                   0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_BREAKC,    0, ALU_OP0_NOP, tgsi_unsupported},
-       {TGSI_OPCODE_KIL,       0, ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
+       {TGSI_OPCODE_KILL_IF,   0, ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
        {TGSI_OPCODE_END,       0, ALU_OP0_NOP, tgsi_end},  /* aka HALT */
        /* gap */
        {118,                   0, ALU_OP0_NOP, tgsi_unsupported},
@@ -6341,7 +6057,7 @@ static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
        {TGSI_OPCODE_COS,       0, ALU_OP1_COS, cayman_trig},
        {TGSI_OPCODE_DDX,       0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
        {TGSI_OPCODE_DDY,       0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
-       {TGSI_OPCODE_KILP,      0, ALU_OP2_KILLGT, tgsi_kill},  /* predicated kill */
+       {TGSI_OPCODE_KILL,      0, ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
        {TGSI_OPCODE_PK2H,      0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_PK2US,     0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_PK4B,      0, ALU_OP0_NOP, tgsi_unsupported},
@@ -6377,8 +6093,7 @@ static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
        {TGSI_OPCODE_TXL,       0, FETCH_OP_SAMPLE_L, tgsi_tex},
        {TGSI_OPCODE_BRK,       0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
        {TGSI_OPCODE_IF,        0, ALU_OP0_NOP, tgsi_if},
-       /* gap */
-       {75,                    0, ALU_OP0_NOP, tgsi_unsupported},
+       {TGSI_OPCODE_UIF,       0, ALU_OP0_NOP, tgsi_uif},
        {76,                    0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_ELSE,      0, ALU_OP0_NOP, tgsi_else},
        {TGSI_OPCODE_ENDIF,     0, ALU_OP0_NOP, tgsi_endif},
@@ -6421,9 +6136,10 @@ static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
        {111,                   0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_NRM4,      0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_CALLNZ,    0, ALU_OP0_NOP, tgsi_unsupported},
-       {TGSI_OPCODE_IFC,       0, ALU_OP0_NOP, tgsi_unsupported},
+       /* gap */
+       {114,                   0, ALU_OP0_NOP, tgsi_unsupported},
        {TGSI_OPCODE_BREAKC,    0, ALU_OP0_NOP, tgsi_unsupported},
-       {TGSI_OPCODE_KIL,       0, ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
+       {TGSI_OPCODE_KILL_IF,   0, ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
        {TGSI_OPCODE_END,       0, ALU_OP0_NOP, tgsi_end},  /* aka HALT */
        /* gap */
        {118,                   0, ALU_OP0_NOP, tgsi_unsupported},