i965/fs: Move brw_wm_compile::fp to fs_visitor.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_emit.cpp
index a0db8aa27ca4bab02c559d44570bda928efb231b..4a1700b8fdb0d186bf84888bc1867b2cabe757ed 100644 (file)
@@ -34,6 +34,7 @@ extern "C" {
 } /* extern "C" */
 
 #include "brw_fs.h"
+#include "brw_cfg.h"
 #include "glsl/ir_print_visitor.h"
 
 void
@@ -41,6 +42,7 @@ fs_visitor::generate_fb_write(fs_inst *inst)
 {
    bool eot = inst->eot;
    struct brw_reg implied_header;
+   uint32_t msg_control;
 
    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
     * move, here's g1.
@@ -57,6 +59,18 @@ fs_visitor::generate_fb_write(fs_inst *inst)
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
+         if (inst->target > 0 &&
+            c->key.nr_color_regions > 1 &&
+            c->key.sample_alpha_to_coverage) {
+            /* Set "Source0 Alpha Present to RenderTarget" bit in message
+             * header.
+             */
+            brw_OR(p,
+                  vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
+                  vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+                  brw_imm_ud(0x1 << 11));
+         }
+
         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
@@ -77,12 +91,20 @@ fs_visitor::generate_fb_write(fs_inst *inst)
       implied_header = brw_null_reg();
    }
 
+   if (this->dual_src_output.file != BAD_FILE)
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
+   else if (dispatch_width == 16)
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+   else
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
+
    brw_pop_insn_state(p);
 
    brw_fb_WRITE(p,
-               c->dispatch_width,
+               dispatch_width,
                inst->base_mrf,
                implied_header,
+               msg_control,
                inst->target,
                inst->mlen,
                0,
@@ -111,7 +133,7 @@ fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
       deltas = brw_imm_v(0x11001100);
    }
 
-   if (c->dispatch_width == 16) {
+   if (dispatch_width == 16) {
       dst = vec16(dst);
    }
 
@@ -142,6 +164,29 @@ fs_visitor::generate_linterp(fs_inst *inst,
    }
 }
 
+void
+fs_visitor::generate_math1_gen7(fs_inst *inst,
+                               struct brw_reg dst,
+                               struct brw_reg src0)
+{ /* One-operand math emit; generate_code() selects this when intel->gen >= 7. */
+   assert(inst->mlen == 0); /* this path expects a zero message length */
+   brw_math(p, dst,
+           brw_math_function(inst->opcode),
+           0, src0, /* 0 sits where generate_math_gen4() passes inst->base_mrf */
+           BRW_MATH_DATA_VECTOR,
+           BRW_MATH_PRECISION_FULL); /* one emit; no sechalf split as in generate_math1_gen6() */
+}
+
+void
+fs_visitor::generate_math2_gen7(fs_inst *inst,
+                               struct brw_reg dst,
+                               struct brw_reg src0,
+                               struct brw_reg src1)
+{ /* Two-operand math emit; generate_code() selects this when intel->gen >= 7. */
+   assert(inst->mlen == 0); /* this path expects a zero message length */
+   brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1); /* one emit; no sechalf split as in generate_math2_gen6() */
+}
+
 void
 fs_visitor::generate_math1_gen6(fs_inst *inst,
                                struct brw_reg dst,
@@ -154,18 +199,14 @@ fs_visitor::generate_math1_gen6(fs_inst *inst,
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_math(p, dst,
            op,
-           inst->saturate ? BRW_MATH_SATURATE_SATURATE :
-           BRW_MATH_SATURATE_NONE,
            0, src0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
 
-   if (c->dispatch_width == 16) {
+   if (dispatch_width == 16) {
       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
       brw_math(p, sechalf(dst),
               op,
-              inst->saturate ? BRW_MATH_SATURATE_SATURATE :
-              BRW_MATH_SATURATE_NONE,
               0, sechalf(src0),
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);
@@ -186,7 +227,7 @@ fs_visitor::generate_math2_gen6(fs_inst *inst,
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_math2(p, dst, op, src0, src1);
 
-   if (c->dispatch_width == 16) {
+   if (dispatch_width == 16) {
       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
       brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
@@ -205,18 +246,14 @@ fs_visitor::generate_math_gen4(fs_inst *inst,
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_math(p, dst,
            op,
-           inst->saturate ? BRW_MATH_SATURATE_SATURATE :
-           BRW_MATH_SATURATE_NONE,
            inst->base_mrf, src,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
 
-   if (c->dispatch_width == 16) {
+   if (dispatch_width == 16) {
       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
       brw_math(p, sechalf(dst),
               op,
-              inst->saturate ? BRW_MATH_SATURATE_SATURATE :
-              BRW_MATH_SATURATE_NONE,
               inst->base_mrf + 1, sechalf(src),
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);
@@ -231,13 +268,26 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
    int msg_type = -1;
    int rlen = 4;
    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+   uint32_t return_format;
 
-   if (c->dispatch_width == 16)
+   switch (dst.type) {
+   case BRW_REGISTER_TYPE_D:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
+      break;
+   case BRW_REGISTER_TYPE_UD:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+      break;
+   default:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+      break;
+   }
+
+   if (dispatch_width == 16)
       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 
    if (intel->gen >= 5) {
       switch (inst->opcode) {
-      case FS_OPCODE_TEX:
+      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
@@ -251,21 +301,21 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
-      case FS_OPCODE_TXL:
+      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
-      case FS_OPCODE_TXS:
+      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
-      case FS_OPCODE_TXD:
+      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         break;
-      case FS_OPCODE_TXF:
+      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
       default:
@@ -274,11 +324,11 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
       }
    } else {
       switch (inst->opcode) {
-      case FS_OPCODE_TEX:
+      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
-        assert(c->dispatch_width == 8);
+        assert(dispatch_width == 8);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
@@ -296,7 +346,7 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
-      case FS_OPCODE_TXL:
+      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
@@ -306,17 +356,17 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
-      case FS_OPCODE_TXD:
+      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
-      case FS_OPCODE_TXF:
+      case SHADER_OPCODE_TXF:
         assert(inst->mlen == 9);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
-      case FS_OPCODE_TXS:
+      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
@@ -333,6 +383,27 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
       dst = vec16(dst);
    }
 
+   /* Load the message header if present.  If there's a texture offset,
+    * we need to set it up explicitly and load the offset bitfield.
+    * Otherwise, we can use an implied move from g0 to the first message reg.
+    */
+   if (inst->texture_offset) {
+      brw_push_insn_state(p);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      /* Explicitly set up the message header by copying g0 to the MRF. */
+      brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
+                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+      /* Then set the offset bits in DWord 2. */
+      brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+                                     inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(inst->texture_offset));
+      brw_pop_insn_state(p);
+   } else if (inst->header_present) {
+      /* Set up an implied move from g0 to the MRF. */
+      src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+   }
+
    brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
@@ -343,9 +414,9 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
              msg_type,
              rlen,
              inst->mlen,
-             0,
              inst->header_present,
-             simd_mode);
+             simd_mode,
+             return_format);
 }
 
 
@@ -393,8 +464,13 @@ fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
    brw_ADD(p, dst, src0, negate(src1));
 }
 
+/* The negate_value boolean is used to negate the derivative computation for
+ * FBOs, since they place the origin at the upper left instead of the lower
+ * left.
+ */
 void
-fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
+fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+                         bool negate_value)
 {
    struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                 BRW_REGISTER_TYPE_F,
@@ -408,7 +484,10 @@ fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
                                 BRW_WIDTH_4,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
-   brw_ADD(p, dst, src0, negate(src1));
+   if (negate_value)
+      brw_ADD(p, dst, src1, negate(src0));
+   else
+      brw_ADD(p, dst, src0, negate(src1));
 }
 
 void
@@ -506,7 +585,9 @@ fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
 }
 
 void
-fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
+fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst,
+                                       struct brw_reg index,
+                                       struct brw_reg offset)
 {
    assert(inst->mlen != 0);
 
@@ -523,8 +604,16 @@ fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
    if (intel->gen == 4 && !intel->is_g4x)
       brw_MOV(p, brw_null_reg(), dst);
 
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+         index.type == BRW_REGISTER_TYPE_UD);
+   uint32_t surf_index = index.dw1.ud;
+
+   assert(offset.file == BRW_IMMEDIATE_VALUE &&
+         offset.type == BRW_REGISTER_TYPE_UD);
+   uint32_t read_offset = offset.dw1.ud;
+
    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
-                       inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
+                       read_offset, surf_index);
 
    if (intel->gen == 4 && !intel->is_g4x) {
       /* gen4 errata: destination from a send can't be used as a
@@ -535,6 +624,44 @@ fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
    }
 }
 
+
+/**
+ * Copy the current pixel/sample mask (from R1.7 bits 15:0) into the flags
+ * register (f0.0) with a single MOV issued under BRW_MASK_DISABLE.
+ *
+ * Used only on Gen6 and above; this is asserted before the MOV is emitted.
+ */
+void
+fs_visitor::generate_mov_dispatch_to_flags()
+{
+   struct brw_reg f0 = brw_flag_reg(); /* destination of the MOV */
+   struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); /* source: g1, subregister 7, read as UW */
+
+   assert (intel->gen >= 6);
+   brw_push_insn_state(p); /* the mask-control change below is bracketed by push/pop */
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, f0, g1);
+   brw_pop_insn_state(p);
+}
+
+
+static uint32_t brw_file_from_reg(fs_reg *reg) /* maps reg->file to a BRW_* register-file value */
+{
+   switch (reg->file) {
+   case ARF:
+      return BRW_ARCHITECTURE_REGISTER_FILE;
+   case GRF:
+      return BRW_GENERAL_REGISTER_FILE;
+   case MRF:
+      return BRW_MESSAGE_REGISTER_FILE;
+   case IMM:
+      return BRW_IMMEDIATE_VALUE;
+   default: /* any other reg->file value is treated as a programming error */
+      assert(!"not reached");
+      return BRW_GENERAL_REGISTER_FILE; /* value returned when the assert is compiled out */
+   }
+}
+
 static struct brw_reg
 brw_reg_from_fs_reg(fs_reg *reg)
 {
@@ -545,9 +672,9 @@ brw_reg_from_fs_reg(fs_reg *reg)
    case ARF:
    case MRF:
       if (reg->smear == -1) {
-        brw_reg = brw_vec8_reg(reg->file, reg->reg, 0);
+        brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
       } else {
-        brw_reg = brw_vec1_reg(reg->file, reg->reg, reg->smear);
+        brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
       }
       brw_reg = retype(brw_reg, reg->type);
       if (reg->sechalf)
@@ -597,33 +724,59 @@ brw_reg_from_fs_reg(fs_reg *reg)
 void
 fs_visitor::generate_code()
 {
-   int last_native_inst = p->nr_insn;
+   int last_native_insn_offset = p->next_insn_offset;
    const char *last_annotation_string = NULL;
-   ir_instruction *last_annotation_ir = NULL;
-
-   int loop_stack_array_size = 16;
-   int loop_stack_depth = 0;
-   brw_instruction **loop_stack =
-      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
-   int *if_depth_in_loop =
-      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
-
+   const void *last_annotation_ir = NULL;
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
-            prog->Name, c->dispatch_width);
+      if (shader) {
+         printf("Native code for fragment shader %d (%d-wide dispatch):\n",
+                prog->Name, dispatch_width);
+      } else {
+         printf("Native code for fragment program %d (%d-wide dispatch):\n",
+                fp->Base.Id, dispatch_width);
+      }
    }
 
+   cfg_t *cfg = NULL;
+   if (unlikely(INTEL_DEBUG & DEBUG_WM))
+      cfg = new(mem_ctx) cfg_t(this);
+
    foreach_list(node, &this->instructions) {
       fs_inst *inst = (fs_inst *)node;
       struct brw_reg src[3], dst;
 
       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+        foreach_list(node, &cfg->block_list) {
+           bblock_link *link = (bblock_link *)node;
+           bblock_t *block = link->block;
+
+           if (block->start == inst) {
+              printf("   START B%d", block->block_num);
+              foreach_list(predecessor_node, &block->parents) {
+                 bblock_link *predecessor_link =
+                    (bblock_link *)predecessor_node;
+                 bblock_t *predecessor_block = predecessor_link->block;
+                 printf(" <-B%d", predecessor_block->block_num);
+              }
+              printf("\n");
+           }
+        }
+
         if (last_annotation_ir != inst->ir) {
            last_annotation_ir = inst->ir;
            if (last_annotation_ir) {
               printf("   ");
-              last_annotation_ir->print();
+               if (shader)
+                  ((ir_instruction *)inst->ir)->print();
+               else {
+                  const prog_instruction *fpi;
+                  fpi = (const prog_instruction *)inst->ir;
+                  printf("%d: ", (int)(fpi - fp->Base.Instructions));
+                  _mesa_fprint_instruction_opt(stdout,
+                                               fpi,
+                                               0, PROG_PRINT_DEBUG, NULL);
+               }
               printf("\n");
            }
         }
@@ -636,15 +789,25 @@ fs_visitor::generate_code()
 
       for (unsigned int i = 0; i < 3; i++) {
         src[i] = brw_reg_from_fs_reg(&inst->src[i]);
+
+        /* The accumulator result appears to get used for the
+         * conditional modifier generation.  When negating a UD
+         * value, there is a 33rd bit generated for the sign in the
+         * accumulator value, so now you can't check, for example,
+         * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
+         */
+        assert(!inst->conditional_mod ||
+               inst->src[i].type != BRW_REGISTER_TYPE_UD ||
+               !inst->src[i].negate);
       }
       dst = brw_reg_from_fs_reg(&inst->dst);
 
       brw_set_conditionalmod(p, inst->conditional_mod);
-      brw_set_predicate_control(p, inst->predicated);
+      brw_set_predicate_control(p, inst->predicate);
       brw_set_predicate_inverse(p, inst->predicate_inverse);
       brw_set_saturate(p, inst->saturate);
 
-      if (inst->force_uncompressed || c->dispatch_width == 8) {
+      if (inst->force_uncompressed || dispatch_width == 8) {
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       } else if (inst->force_sechalf) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
@@ -668,6 +831,20 @@ fs_visitor::generate_code()
         brw_set_acc_write_control(p, 0);
         break;
 
+      case BRW_OPCODE_MAD:
+        brw_set_access_mode(p, BRW_ALIGN_16);
+        if (dispatch_width == 16) {
+           brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+           brw_MAD(p, dst, src[0], src[1], src[2]);
+           brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+           brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
+           brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+        } else {
+           brw_MAD(p, dst, src[0], src[1], src[2]);
+        }
+        brw_set_access_mode(p, BRW_ALIGN_1);
+        break;
+
       case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
@@ -716,9 +893,8 @@ fs_visitor::generate_code()
            assert(intel->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
-           brw_IF(p, c->dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
+           brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
         }
-        if_depth_in_loop[loop_stack_depth]++;
         break;
 
       case BRW_OPCODE_ELSE:
@@ -726,59 +902,27 @@ fs_visitor::generate_code()
         break;
       case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
-        if_depth_in_loop[loop_stack_depth]--;
         break;
 
       case BRW_OPCODE_DO:
-        loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
-        if (loop_stack_array_size <= loop_stack_depth) {
-           loop_stack_array_size *= 2;
-           loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
-                                 loop_stack_array_size);
-           if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
-                                       loop_stack_array_size);
-        }
-        if_depth_in_loop[loop_stack_depth] = 0;
+        brw_DO(p, BRW_EXECUTE_8);
         break;
 
       case BRW_OPCODE_BREAK:
-        brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
+        brw_BREAK(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
       case BRW_OPCODE_CONTINUE:
         /* FINISHME: We need to write the loop instruction support still. */
         if (intel->gen >= 6)
-           gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
+           gen6_CONT(p);
         else
-           brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
+           brw_CONT(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
 
-      case BRW_OPCODE_WHILE: {
-        struct brw_instruction *inst0, *inst1;
-        GLuint br = 1;
-
-        if (intel->gen >= 5)
-           br = 2;
-
-        assert(loop_stack_depth > 0);
-        loop_stack_depth--;
-        inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
-        if (intel->gen < 6) {
-           /* patch all the BREAK/CONT instructions from last BGNLOOP */
-           while (inst0 > loop_stack[loop_stack_depth]) {
-              inst0--;
-              if (inst0->header.opcode == BRW_OPCODE_BREAK &&
-                  inst0->bits3.if_else.jump_count == 0) {
-                 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-           }
-              else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-                       inst0->bits3.if_else.jump_count == 0) {
-                 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-              }
-           }
-        }
-      }
+      case BRW_OPCODE_WHILE:
+        brw_WHILE(p);
         break;
 
       case SHADER_OPCODE_RCP:
@@ -788,7 +932,9 @@ fs_visitor::generate_code()
       case SHADER_OPCODE_LOG2:
       case SHADER_OPCODE_SIN:
       case SHADER_OPCODE_COS:
-        if (intel->gen >= 6) {
+        if (intel->gen >= 7) {
+           generate_math1_gen7(inst, dst, src[0]);
+        } else if (intel->gen == 6) {
            generate_math1_gen6(inst, dst, src[0]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
@@ -797,7 +943,9 @@ fs_visitor::generate_code()
       case SHADER_OPCODE_INT_QUOTIENT:
       case SHADER_OPCODE_INT_REMAINDER:
       case SHADER_OPCODE_POW:
-        if (intel->gen >= 6) {
+        if (intel->gen >= 7) {
+           generate_math2_gen7(inst, dst, src[0], src[1]);
+        } else if (intel->gen == 6) {
            generate_math2_gen6(inst, dst, src[0], src[1]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
@@ -815,12 +963,12 @@ fs_visitor::generate_code()
       case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
-      case FS_OPCODE_TEX:
+      case SHADER_OPCODE_TEX:
       case FS_OPCODE_TXB:
-      case FS_OPCODE_TXD:
-      case FS_OPCODE_TXF:
-      case FS_OPCODE_TXL:
-      case FS_OPCODE_TXS:
+      case SHADER_OPCODE_TXD:
+      case SHADER_OPCODE_TXF:
+      case SHADER_OPCODE_TXL:
+      case SHADER_OPCODE_TXS:
         generate_tex(inst, dst, src[0]);
         break;
       case FS_OPCODE_DISCARD:
@@ -830,7 +978,11 @@ fs_visitor::generate_code()
         generate_ddx(inst, dst, src[0]);
         break;
       case FS_OPCODE_DDY:
-        generate_ddy(inst, dst, src[0]);
+         /* Make sure fp->UsesDFdy flag got set (otherwise there's no
+          * guarantee that c->key.render_to_fbo is set).
+          */
+         assert(fp->UsesDFdy);
+        generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
         break;
 
       case FS_OPCODE_SPILL:
@@ -842,16 +994,21 @@ fs_visitor::generate_code()
         break;
 
       case FS_OPCODE_PULL_CONSTANT_LOAD:
-        generate_pull_constant_load(inst, dst);
+        generate_pull_constant_load(inst, dst, src[0], src[1]);
         break;
 
       case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst);
         break;
+
+      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
+         generate_mov_dispatch_to_flags();
+         break;
+
       default:
-        if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
+        if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
-                         brw_opcodes[inst->opcode].name);
+                         opcode_descs[inst->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
         }
@@ -859,28 +1016,33 @@ fs_visitor::generate_code()
       }
 
       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-        for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
-           if (0) {
-              printf("0x%08x 0x%08x 0x%08x 0x%08x ",
-                     ((uint32_t *)&p->store[i])[3],
-                     ((uint32_t *)&p->store[i])[2],
-                     ((uint32_t *)&p->store[i])[1],
-                     ((uint32_t *)&p->store[i])[0]);
+        brw_dump_compile(p, stdout,
+                         last_native_insn_offset, p->next_insn_offset);
+
+        foreach_list(node, &cfg->block_list) {
+           bblock_link *link = (bblock_link *)node;
+           bblock_t *block = link->block;
+
+           if (block->end == inst) {
+              printf("   END B%d", block->block_num);
+              foreach_list(successor_node, &block->children) {
+                 bblock_link *successor_link =
+                    (bblock_link *)successor_node;
+                 bblock_t *successor_block = successor_link->block;
+                 printf(" ->B%d", successor_block->block_num);
+              }
+              printf("\n");
            }
-           brw_disasm(stdout, &p->store[i], intel->gen);
         }
       }
 
-      last_native_inst = p->nr_insn;
+      last_native_insn_offset = p->next_insn_offset;
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
       printf("\n");
    }
 
-   ralloc_free(loop_stack);
-   ralloc_free(if_depth_in_loop);
-
    brw_set_uip_jip(p);
 
    /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
@@ -889,15 +1051,6 @@ fs_visitor::generate_code()
     * case you're doing that.
     */
    if (0) {
-      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-        for (unsigned int i = 0; i < p->nr_insn; i++) {
-           printf("0x%08x 0x%08x 0x%08x 0x%08x ",
-                  ((uint32_t *)&p->store[i])[3],
-                  ((uint32_t *)&p->store[i])[2],
-                  ((uint32_t *)&p->store[i])[1],
-                  ((uint32_t *)&p->store[i])[0]);
-           brw_disasm(stdout, &p->store[i], intel->gen);
-        }
-      }
+      brw_dump_compile(p, stdout, 0, p->next_insn_offset);
    }
 }