+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ } else {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP1_EXP_IEEE;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.write = 1;
+ alu.dst.chan = 2;
+
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ }
+
+ /* result.w = 1.0;*/
+ if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_MOV;
+ alu.src[0].sel = V_SQ_ALU_SRC_1;
+ alu.src[0].chan = 0;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 3;
+ alu.dst.write = 1;
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ return tgsi_helper_copy(ctx, inst);
+}
+
+static int tgsi_log(struct r600_shader_ctx *ctx)
+{
+ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+ struct r600_bytecode_alu alu;
+ int r;
+ unsigned i;
+
+ /* result.x = floor(log2(|src|)); */
+ if (inst->Dst[0].Register.WriteMask & 1) {
+ if (ctx->bc->chip_class == CAYMAN) {
+ for (i = 0; i < 3; i++) {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_LOG_IEEE;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src_set_abs(&alu.src[0]);
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = i;
+ if (i == 0)
+ alu.dst.write = 1;
+ if (i == 2)
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ } else {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_LOG_IEEE;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src_set_abs(&alu.src[0]);
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ alu.op = ALU_OP1_FLOOR;
+ alu.src[0].sel = ctx->temp_reg;
+ alu.src[0].chan = 0;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 0;
+ alu.dst.write = 1;
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
+ if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
+
+ if (ctx->bc->chip_class == CAYMAN) {
+ for (i = 0; i < 3; i++) {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_LOG_IEEE;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src_set_abs(&alu.src[0]);
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = i;
+ if (i == 1)
+ alu.dst.write = 1;
+ if (i == 2)
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ } else {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_LOG_IEEE;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src_set_abs(&alu.src[0]);
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_FLOOR;
+ alu.src[0].sel = ctx->temp_reg;
+ alu.src[0].chan = 1;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+
+ if (ctx->bc->chip_class == CAYMAN) {
+ for (i = 0; i < 3; i++) {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP1_EXP_IEEE;
+ alu.src[0].sel = ctx->temp_reg;
+ alu.src[0].chan = 1;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = i;
+ if (i == 1)
+ alu.dst.write = 1;
+ if (i == 2)
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ } else {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP1_EXP_IEEE;
+ alu.src[0].sel = ctx->temp_reg;
+ alu.src[0].chan = 1;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ if (ctx->bc->chip_class == CAYMAN) {
+ for (i = 0; i < 3; i++) {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP1_RECIP_IEEE;
+ alu.src[0].sel = ctx->temp_reg;
+ alu.src[0].chan = 1;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = i;
+ if (i == 1)
+ alu.dst.write = 1;
+ if (i == 2)
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ } else {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP1_RECIP_IEEE;
+ alu.src[0].sel = ctx->temp_reg;
+ alu.src[0].chan = 1;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP2_MUL;
+
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src_set_abs(&alu.src[0]);
+
+ alu.src[1].sel = ctx->temp_reg;
+ alu.src[1].chan = 1;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 1;
+ alu.dst.write = 1;
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ /* result.z = log2(|src|);*/
+ if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
+ if (ctx->bc->chip_class == CAYMAN) {
+ for (i = 0; i < 3; i++) {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_LOG_IEEE;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src_set_abs(&alu.src[0]);
+
+ alu.dst.sel = ctx->temp_reg;
+ if (i == 2)
+ alu.dst.write = 1;
+ alu.dst.chan = i;
+ if (i == 2)
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ } else {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_LOG_IEEE;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+ r600_bytecode_src_set_abs(&alu.src[0]);
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.write = 1;
+ alu.dst.chan = 2;
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ }
+
+ /* result.w = 1.0; */
+ if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP1_MOV;
+ alu.src[0].sel = V_SQ_ALU_SRC_1;
+ alu.src[0].chan = 0;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.chan = 3;
+ alu.dst.write = 1;
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ return tgsi_helper_copy(ctx, inst);
+}
+
+static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
+{
+ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+ struct r600_bytecode_alu alu;
+ int r;
+ int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
+ unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
+
+ assert(inst->Dst[0].Register.Index < 3);
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_ARL:
+ alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
+ break;
+ case TGSI_OPCODE_ARR:
+ alu.op = ALU_OP1_FLT_TO_INT;
+ break;
+ case TGSI_OPCODE_UARL:
+ alu.op = ALU_OP1_MOV;
+ break;
+ default:
+ assert(0);
+ return -1;
+ }
+
+ for (i = 0; i <= lasti; ++i) {
+ if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+ continue;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ alu.last = i == lasti;
+ alu.dst.sel = reg;
+ alu.dst.chan = i;
+ alu.dst.write = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+ if (inst->Dst[0].Register.Index > 0)
+ ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
+ else
+ ctx->bc->ar_loaded = 0;
+
+ return 0;
+}
+static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
+{
+ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+ struct r600_bytecode_alu alu;
+ int r;
+ int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
+
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_ARL:
+ memset(&alu, 0, sizeof(alu));
+ alu.op = ALU_OP1_FLOOR;
+ alu.dst.sel = ctx->bc->ar_reg;
+ alu.dst.write = 1;
+ for (i = 0; i <= lasti; ++i) {
+ if (inst->Dst[0].Register.WriteMask & (1 << i)) {
+ alu.dst.chan = i;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ alu.last = i == lasti;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+ }
+ }
+
+ memset(&alu, 0, sizeof(alu));
+ alu.op = ALU_OP1_FLT_TO_INT;
+ alu.src[0].sel = ctx->bc->ar_reg;
+ alu.dst.sel = ctx->bc->ar_reg;
+ alu.dst.write = 1;
+ /* FLT_TO_INT is trans-only on r600/r700 */
+ alu.last = TRUE;
+ for (i = 0; i <= lasti; ++i) {
+ alu.dst.chan = i;
+ alu.src[0].chan = i;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+ }
+ break;
+ case TGSI_OPCODE_ARR:
+ memset(&alu, 0, sizeof(alu));
+ alu.op = ALU_OP1_FLT_TO_INT;
+ alu.dst.sel = ctx->bc->ar_reg;
+ alu.dst.write = 1;
+ /* FLT_TO_INT is trans-only on r600/r700 */
+ alu.last = TRUE;
+ for (i = 0; i <= lasti; ++i) {
+ if (inst->Dst[0].Register.WriteMask & (1 << i)) {
+ alu.dst.chan = i;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+ }
+ }
+ break;
+ case TGSI_OPCODE_UARL:
+ memset(&alu, 0, sizeof(alu));
+ alu.op = ALU_OP1_MOV;
+ alu.dst.sel = ctx->bc->ar_reg;
+ alu.dst.write = 1;
+ for (i = 0; i <= lasti; ++i) {
+ if (inst->Dst[0].Register.WriteMask & (1 << i)) {
+ alu.dst.chan = i;
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ alu.last = i == lasti;
+ if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+ return r;
+ }
+ }
+ break;
+ default:
+ assert(0);
+ return -1;
+ }
+
+ ctx->bc->ar_loaded = 0;
+ return 0;
+}
+
+static int tgsi_opdst(struct r600_shader_ctx *ctx)
+{
+ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+ struct r600_bytecode_alu alu;
+ int i, r = 0;
+
+ for (i = 0; i < 4; i++) {
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.op = ALU_OP2_MUL;
+ tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+
+ if (i == 0 || i == 3) {
+ alu.src[0].sel = V_SQ_ALU_SRC_1;
+ } else {
+ r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
+ }
+
+ if (i == 0 || i == 2) {
+ alu.src[1].sel = V_SQ_ALU_SRC_1;
+ } else {
+ r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
+ }
+ if (i == 3)
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ return 0;
+}
+
+static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
+ struct r600_bytecode_alu_src *src)
+{
+ struct r600_bytecode_alu alu;
+ int r;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = opcode;
+ alu.execute_mask = 1;
+ alu.update_pred = 1;
+
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.write = 1;
+ alu.dst.chan = 0;
+
+ alu.src[0] = *src;
+ alu.src[1].sel = V_SQ_ALU_SRC_0;
+ alu.src[1].chan = 0;
+
+ alu.last = 1;
+
+ r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
+ if (r)
+ return r;
+ return 0;
+}
+
+static int pops(struct r600_shader_ctx *ctx, int pops)
+{
+ unsigned force_pop = ctx->bc->force_add_cf;
+
+ if (!force_pop) {
+ int alu_pop = 3;
+ if (ctx->bc->cf_last) {
+ if (ctx->bc->cf_last->op == CF_OP_ALU)
+ alu_pop = 0;
+ else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
+ alu_pop = 1;
+ }
+ alu_pop += pops;
+ if (alu_pop == 1) {
+ ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
+ ctx->bc->force_add_cf = 1;
+ } else if (alu_pop == 2) {
+ ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
+ ctx->bc->force_add_cf = 1;
+ } else {
+ force_pop = 1;
+ }
+ }
+
+ if (force_pop) {
+ r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
+ ctx->bc->cf_last->pop_count = pops;
+ ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
+ }
+
+ return 0;
+}
+
+static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
+ unsigned reason)
+{
+ struct r600_stack_info *stack = &ctx->bc->stack;
+ unsigned elements;
+ int entries;
+
+ unsigned entry_size = stack->entry_size;
+
+ elements = (stack->loop + stack->push_wqm ) * entry_size;
+ elements += stack->push;
+
+ switch (ctx->bc->chip_class) {
+ case R600:
+ case R700:
+ /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
+ * the stack must be reserved to hold the current active/continue
+ * masks */
+ if (reason == FC_PUSH_VPM) {
+ elements += 2;
+ }
+ break;
+
+ case CAYMAN:
+ /* r9xx: any stack operation on empty stack consumes 2 additional
+ * elements */
+ elements += 2;
+
+ /* fallthrough */
+ /* FIXME: do the two elements added above cover the cases for the
+ * r8xx+ below? */
+
+ case EVERGREEN:
+ /* r8xx+: 2 extra elements are not always required, but one extra
+ * element must be added for each of the following cases:
+ * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
+ * stack usage.
+ * (Currently we don't use ALU_ELSE_AFTER.)
+ * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
+ * PUSH instruction executed.
+ *
+ * NOTE: it seems we also need to reserve additional element in some
+ * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
+ * then STACK_SIZE should be 2 instead of 1 */
+ if (reason == FC_PUSH_VPM) {
+ elements += 1;
+ }
+ break;
+
+ default:
+ assert(0);
+ break;
+ }
+
+ /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
+ * for all chips, so we use 4 in the final formula, not the real entry_size
+ * for the chip */
+ entry_size = 4;
+
+ entries = (elements + (entry_size - 1)) / entry_size;
+
+ if (entries > stack->max_entries)
+ stack->max_entries = entries;
+}
+
+static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
+{
+ switch(reason) {
+ case FC_PUSH_VPM:
+ --ctx->bc->stack.push;
+ assert(ctx->bc->stack.push >= 0);
+ break;
+ case FC_PUSH_WQM:
+ --ctx->bc->stack.push_wqm;
+ assert(ctx->bc->stack.push_wqm >= 0);
+ break;
+ case FC_LOOP:
+ --ctx->bc->stack.loop;
+ assert(ctx->bc->stack.loop >= 0);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
+{
+ switch (reason) {
+ case FC_PUSH_VPM:
+ ++ctx->bc->stack.push;
+ break;
+ case FC_PUSH_WQM:
+ ++ctx->bc->stack.push_wqm;
+ case FC_LOOP:
+ ++ctx->bc->stack.loop;
+ break;
+ default:
+ assert(0);
+ }
+
+ callstack_update_max_depth(ctx, reason);
+}
+
+static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
+{
+ struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
+
+ sp->mid = realloc((void *)sp->mid,
+ sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
+ sp->mid[sp->num_mid] = ctx->bc->cf_last;
+ sp->num_mid++;
+}
+
+static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
+{
+ assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
+ ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
+ ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
+ ctx->bc->fc_sp++;
+}
+
+static void fc_poplevel(struct r600_shader_ctx *ctx)
+{
+ struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
+ free(sp->mid);
+ sp->mid = NULL;
+ sp->num_mid = 0;
+ sp->start = NULL;
+ sp->type = 0;
+ ctx->bc->fc_sp--;
+}
+
+#if 0
+static int emit_return(struct r600_shader_ctx *ctx)
+{
+ r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
+ return 0;
+}
+
+static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
+{
+
+ r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
+ ctx->bc->cf_last->pop_count = pops;
+ /* XXX work out offset */
+ return 0;
+}
+
+static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
+{
+ return 0;
+}
+
+static void emit_testflag(struct r600_shader_ctx *ctx)
+{
+
+}
+
+static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
+{
+ emit_testflag(ctx);
+ emit_jump_to_offset(ctx, 1, 4);
+ emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
+ pops(ctx, ifidx + 1);
+ emit_return(ctx);
+}
+
+static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
+{
+ emit_testflag(ctx);
+
+ r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
+ ctx->bc->cf_last->pop_count = 1;
+
+ fc_set_mid(ctx, fc_sp);
+
+ pops(ctx, 1);
+}
+#endif
+
+static int emit_if(struct r600_shader_ctx *ctx, int opcode,
+ struct r600_bytecode_alu_src *src)
+{
+ int alu_type = CF_OP_ALU_PUSH_BEFORE;
+
+ /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
+ * LOOP_STARTxxx for nested loops may put the branch stack into a state
+ * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
+ * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
+ if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
+ r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
+ ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
+ alu_type = CF_OP_ALU;
+ }
+
+ emit_logic_pred(ctx, opcode, alu_type, src);
+
+ r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
+
+ fc_pushlevel(ctx, FC_IF);
+
+ callstack_push(ctx, FC_PUSH_VPM);
+ return 0;
+}
+
+static int tgsi_if(struct r600_shader_ctx *ctx)
+{
+ struct r600_bytecode_alu_src alu_src;
+ r600_bytecode_src(&alu_src, &ctx->src[0], 0);
+
+ return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
+}
+
+static int tgsi_uif(struct r600_shader_ctx *ctx)
+{
+ struct r600_bytecode_alu_src alu_src;
+ r600_bytecode_src(&alu_src, &ctx->src[0], 0);
+ return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
+}
+
+static int tgsi_else(struct r600_shader_ctx *ctx)
+{
+ r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
+ ctx->bc->cf_last->pop_count = 1;
+
+ fc_set_mid(ctx, ctx->bc->fc_sp - 1);
+ ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
+ return 0;
+}
+
+static int tgsi_endif(struct r600_shader_ctx *ctx)
+{
+ pops(ctx, 1);
+ if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
+ R600_ERR("if/endif unbalanced in shader\n");
+ return -1;
+ }
+
+ if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
+ ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
+ ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
+ } else {
+ ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
+ }
+ fc_poplevel(ctx);
+
+ callstack_pop(ctx, FC_PUSH_VPM);
+ return 0;
+}
+
+static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
+{
+ /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
+ * limited to 4096 iterations, like the other LOOP_* instructions. */
+ r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
+
+ fc_pushlevel(ctx, FC_LOOP);
+
+ /* check stack depth */
+ callstack_push(ctx, FC_LOOP);
+ return 0;
+}
+
+static int tgsi_endloop(struct r600_shader_ctx *ctx)
+{
+ int i;
+
+ r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
+
+ if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
+ R600_ERR("loop/endloop in shader code are not paired.\n");
+ return -EINVAL;
+ }
+
+ /* fixup loop pointers - from r600isa
+ LOOP END points to CF after LOOP START,
+ LOOP START point to CF after LOOP END
+ BRK/CONT point to LOOP END CF
+ */
+ ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
+
+ ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
+
+ for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
+ ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
+ }
+ /* XXX add LOOPRET support */
+ fc_poplevel(ctx);
+ callstack_pop(ctx, FC_LOOP);
+ return 0;
+}
+
+static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
+{
+ unsigned int fscp;
+
+ for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
+ {
+ if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
+ break;
+ }
+
+ if (fscp == 0) {
+ R600_ERR("Break not inside loop/endloop pair\n");
+ return -EINVAL;
+ }
+
+ r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
+
+ fc_set_mid(ctx, fscp - 1);
+
+ return 0;
+}
+
+static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
+{
+ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+ int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
+ int r;
+
+ if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
+ emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
+
+ r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
+ if (!r) {
+ ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
+ if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
+ return emit_inc_ring_offset(ctx, stream, TRUE);
+ }
+ return r;
+}
+
+static int tgsi_umad(struct r600_shader_ctx *ctx)
+{
+ struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+ struct r600_bytecode_alu alu;
+ int i, j, r;
+ int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
+
+ /* src0 * src1 */
+ for (i = 0; i < lasti + 1; i++) {
+ if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+ continue;
+
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+ alu.dst.chan = i;
+ alu.dst.sel = ctx->temp_reg;
+ alu.dst.write = 1;
+
+ alu.op = ALU_OP2_MULLO_UINT;
+ for (j = 0; j < 2; j++) {
+ r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
+ }
+
+ alu.last = 1;
+ r = emit_mul_int_op(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+
+
+ for (i = 0; i < lasti + 1; i++) {
+ if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+ continue;