Merge remote branch 'origin/master' into gallium_draw_llvm
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
index 0ee208d7a4ce07b2248beb8059988c8ad26db81a..d2395dec288ab53624922fb644f879651a110aec 100644 (file)
@@ -55,7 +55,8 @@ static void guess_execution_size( struct brw_instruction *insn,
 static void brw_set_dest( struct brw_instruction *insn,
                          struct brw_reg dest )
 {
-   if (dest.type != BRW_ARCHITECTURE_REGISTER_FILE)
+   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
+       dest.file != BRW_MESSAGE_REGISTER_FILE)
       assert(dest.nr < 128);
 
    insn->bits1.da1.dest_reg_file = dest.file;
@@ -101,8 +102,6 @@ static void brw_set_dest( struct brw_instruction *insn,
 static void brw_set_src0( struct brw_instruction *insn,
                           struct brw_reg reg )
 {
-   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
-
    if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
       assert(reg.nr < 128);
 
@@ -198,7 +197,7 @@ void brw_set_src1( struct brw_instruction *insn,
        * in the future:
        */
       assert (reg.address_mode == BRW_ADDRESS_DIRECT);
-      //assert (reg.file == BRW_GENERAL_REGISTER_FILE);
+      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
 
       if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits3.da1.src1_subreg_nr = reg.subnr;
@@ -241,7 +240,8 @@ void brw_set_src1( struct brw_instruction *insn,
 
 
 
-static void brw_set_math_message( struct brw_instruction *insn,
+static void brw_set_math_message( struct brw_context *brw,
+                                 struct brw_instruction *insn,
                                  GLuint msg_length,
                                  GLuint response_length,
                                  GLuint function,
@@ -250,20 +250,38 @@ static void brw_set_math_message( struct brw_instruction *insn,
                                  GLboolean saturate,
                                  GLuint dataType )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.math.function = function;
-   insn->bits3.math.int_type = integer_type;
-   insn->bits3.math.precision = low_precision;
-   insn->bits3.math.saturate = saturate;
-   insn->bits3.math.data_type = dataType;
-   insn->bits3.math.response_length = response_length;
-   insn->bits3.math.msg_length = msg_length;
-   insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
-   insn->bits3.math.end_of_thread = 0;
+   if (intel->is_ironlake) {
+       insn->bits3.math_igdng.function = function;
+       insn->bits3.math_igdng.int_type = integer_type;
+       insn->bits3.math_igdng.precision = low_precision;
+       insn->bits3.math_igdng.saturate = saturate;
+       insn->bits3.math_igdng.data_type = dataType;
+       insn->bits3.math_igdng.snapshot = 0;
+       insn->bits3.math_igdng.header_present = 0;
+       insn->bits3.math_igdng.response_length = response_length;
+       insn->bits3.math_igdng.msg_length = msg_length;
+       insn->bits3.math_igdng.end_of_thread = 0;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_MATH;
+       insn->bits2.send_igdng.end_of_thread = 0;
+   } else {
+       insn->bits3.math.function = function;
+       insn->bits3.math.int_type = integer_type;
+       insn->bits3.math.precision = low_precision;
+       insn->bits3.math.saturate = saturate;
+       insn->bits3.math.data_type = dataType;
+       insn->bits3.math.response_length = response_length;
+       insn->bits3.math.msg_length = msg_length;
+       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
+       insn->bits3.math.end_of_thread = 0;
+   }
 }
 
-static void brw_set_urb_message( struct brw_instruction *insn,
+
+static void brw_set_ff_sync_message( struct brw_context *brw,
+                                struct brw_instruction *insn,
                                 GLboolean allocate,
                                 GLboolean used,
                                 GLuint msg_length,
@@ -273,21 +291,73 @@ static void brw_set_urb_message( struct brw_instruction *insn,
                                 GLuint offset,
                                 GLuint swizzle_control )
 {
-   brw_set_src1(insn, brw_imm_d(0));
+       brw_set_src1(insn, brw_imm_d(0));
+
+       insn->bits3.urb_igdng.opcode = 1;
+       insn->bits3.urb_igdng.offset = offset;
+       insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+       insn->bits3.urb_igdng.allocate = allocate;
+       insn->bits3.urb_igdng.used = used;
+       insn->bits3.urb_igdng.complete = complete;
+       insn->bits3.urb_igdng.header_present = 1;
+       insn->bits3.urb_igdng.response_length = response_length;
+       insn->bits3.urb_igdng.msg_length = msg_length;
+       insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+}
 
-   insn->bits3.urb.opcode = 0; /* ? */
-   insn->bits3.urb.offset = offset;
-   insn->bits3.urb.swizzle_control = swizzle_control;
-   insn->bits3.urb.allocate = allocate;
-   insn->bits3.urb.used = used;        /* ? */
-   insn->bits3.urb.complete = complete;
-   insn->bits3.urb.response_length = response_length;
-   insn->bits3.urb.msg_length = msg_length;
-   insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
-   insn->bits3.urb.end_of_thread = end_of_thread;
+static void brw_set_urb_message( struct brw_context *brw,
+                                struct brw_instruction *insn,
+                                GLboolean allocate,
+                                GLboolean used,
+                                GLuint msg_length,
+                                GLuint response_length,
+                                GLboolean end_of_thread,
+                                GLboolean complete,
+                                GLuint offset,
+                                GLuint swizzle_control )
+{
+    struct intel_context *intel = &brw->intel;
+    brw_set_src1(insn, brw_imm_d(0));
+
+    if (intel->is_ironlake || intel->gen >= 6) {
+        insn->bits3.urb_igdng.opcode = 0;      /* ? */
+        insn->bits3.urb_igdng.offset = offset;
+        insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+        insn->bits3.urb_igdng.allocate = allocate;
+        insn->bits3.urb_igdng.used = used;     /* ? */
+        insn->bits3.urb_igdng.complete = complete;
+        insn->bits3.urb_igdng.header_present = 1;
+        insn->bits3.urb_igdng.response_length = response_length;
+        insn->bits3.urb_igdng.msg_length = msg_length;
+        insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+       if (intel->gen >= 6) {
+          /* For SNB, the SFID bits moved to the condmod bits, and
+           * EOT stayed in bits3 above.  Does the EOT bit setting
+           * below on Ironlake even do anything?
+           */
+          insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
+       } else {
+          insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+          insn->bits2.send_igdng.end_of_thread = end_of_thread;
+       }
+    } else {
+        insn->bits3.urb.opcode = 0;    /* ? */
+        insn->bits3.urb.offset = offset;
+        insn->bits3.urb.swizzle_control = swizzle_control;
+        insn->bits3.urb.allocate = allocate;
+        insn->bits3.urb.used = used;   /* ? */
+        insn->bits3.urb.complete = complete;
+        insn->bits3.urb.response_length = response_length;
+        insn->bits3.urb.msg_length = msg_length;
+        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
+        insn->bits3.urb.end_of_thread = end_of_thread;
+    }
 }
 
-static void brw_set_dp_write_message( struct brw_instruction *insn,
+static void brw_set_dp_write_message( struct brw_context *brw,
+                                     struct brw_instruction *insn,
                                      GLuint binding_table_index,
                                      GLuint msg_control,
                                      GLuint msg_type,
@@ -296,20 +366,36 @@ static void brw_set_dp_write_message( struct brw_instruction *insn,
                                      GLuint response_length,
                                      GLuint end_of_thread )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_write.binding_table_index = binding_table_index;
-   insn->bits3.dp_write.msg_control = msg_control;
-   insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
-   insn->bits3.dp_write.msg_type = msg_type;
-   insn->bits3.dp_write.send_commit_msg = 0;
-   insn->bits3.dp_write.response_length = response_length;
-   insn->bits3.dp_write.msg_length = msg_length;
-   insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-   insn->bits3.urb.end_of_thread = end_of_thread;
+   if (intel->is_ironlake) {
+       insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_write_igdng.msg_control = msg_control;
+       insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write_igdng.msg_type = msg_type;
+       insn->bits3.dp_write_igdng.send_commit_msg = 0;
+       insn->bits3.dp_write_igdng.header_present = 1;
+       insn->bits3.dp_write_igdng.response_length = response_length;
+       insn->bits3.dp_write_igdng.msg_length = msg_length;
+       insn->bits3.dp_write_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_write.binding_table_index = binding_table_index;
+       insn->bits3.dp_write.msg_control = msg_control;
+       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write.msg_type = msg_type;
+       insn->bits3.dp_write.send_commit_msg = 0;
+       insn->bits3.dp_write.response_length = response_length;
+       insn->bits3.dp_write.msg_length = msg_length;
+       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits3.dp_write.end_of_thread = end_of_thread;
+   }
 }
 
-static void brw_set_dp_read_message( struct brw_instruction *insn,
+static void brw_set_dp_read_message( struct brw_context *brw,
+                                     struct brw_instruction *insn,
                                      GLuint binding_table_index,
                                      GLuint msg_control,
                                      GLuint msg_type,
@@ -318,17 +404,32 @@ static void brw_set_dp_read_message( struct brw_instruction *insn,
                                      GLuint response_length,
                                      GLuint end_of_thread )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
-   insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
-   insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
-   insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
-   insn->bits3.dp_read.response_length = response_length;  /*16:19*/
-   insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
-   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
-   insn->bits3.dp_read.pad1 = 0;  /*28:30*/
-   insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+   if (intel->is_ironlake) {
+       insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_read_igdng.msg_control = msg_control;
+       insn->bits3.dp_read_igdng.msg_type = msg_type;
+       insn->bits3.dp_read_igdng.target_cache = target_cache;
+       insn->bits3.dp_read_igdng.header_present = 1;
+       insn->bits3.dp_read_igdng.response_length = response_length;
+       insn->bits3.dp_read_igdng.msg_length = msg_length;
+       insn->bits3.dp_read_igdng.pad1 = 0;
+       insn->bits3.dp_read_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
+       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
+       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
+       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
+       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
+       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
+       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
+       insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+   }
 }
 
 static void brw_set_sampler_message(struct brw_context *brw,
@@ -338,11 +439,26 @@ static void brw_set_sampler_message(struct brw_context *brw,
                                     GLuint msg_type,
                                     GLuint response_length,
                                     GLuint msg_length,
-                                    GLboolean eot)
+                                    GLboolean eot,
+                                    GLuint header_present,
+                                    GLuint simd_mode)
 {
+   struct intel_context *intel = &brw->intel;
+   assert(eot == 0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_G4X(brw)) {
+   if (intel->is_ironlake) {
+      insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
+      insn->bits3.sampler_igdng.sampler = sampler;
+      insn->bits3.sampler_igdng.msg_type = msg_type;
+      insn->bits3.sampler_igdng.simd_mode = simd_mode;
+      insn->bits3.sampler_igdng.header_present = header_present;
+      insn->bits3.sampler_igdng.response_length = response_length;
+      insn->bits3.sampler_igdng.msg_length = msg_length;
+      insn->bits3.sampler_igdng.end_of_thread = eot;
+      insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
+      insn->bits2.send_igdng.end_of_thread = eot;
+   } else if (intel->is_g4x) {
       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
       insn->bits3.sampler_g4x.sampler = sampler;
       insn->bits3.sampler_g4x.msg_type = msg_type;
@@ -377,8 +493,8 @@ static struct brw_instruction *next_insn( struct brw_compile *p,
    /* Reset this one-shot flag: 
     */
 
-   if (p->current->header.destreg__conditonalmod) {
-      p->current->header.destreg__conditonalmod = 0;   
+   if (p->current->header.destreg__conditionalmod) {
+      p->current->header.destreg__conditionalmod = 0;
       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
    }
 
@@ -457,7 +573,7 @@ ALU2(DPH)
 ALU2(DP3)
 ALU2(DP2)
 ALU2(LINE)
-
+ALU2(PLN)
 
 
 
@@ -543,7 +659,12 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
 struct brw_instruction *brw_ELSE(struct brw_compile *p, 
                                 struct brw_instruction *if_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (intel->is_ironlake)
+      br = 2;
 
    if (p->single_program_flow) {
       insn = next_insn(p, BRW_OPCODE_ADD);
@@ -570,8 +691,8 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
    } else {
       assert(if_insn->header.opcode == BRW_OPCODE_IF);
 
-      if_insn->bits3.if_else.jump_count = insn - if_insn;
-      if_insn->bits3.if_else.pop_count = 1;
+      if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
+      if_insn->bits3.if_else.pop_count = 0;
       if_insn->bits3.if_else.pad0 = 0;
    }
 
@@ -581,6 +702,12 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 void brw_ENDIF(struct brw_compile *p, 
               struct brw_instruction *patch_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
+   GLuint br = 1;
+
+   if (intel->is_ironlake)
+      br = 2; 
    if (p->single_program_flow) {
       /* In single program flow mode, there's no need to execute an ENDIF,
        * since we don't need to do any stack operations, and if we're executing
@@ -612,11 +739,11 @@ void brw_ENDIF(struct brw_compile *p,
         /* Automagically turn it into an IFF:
          */
         patch_insn->header.opcode = BRW_OPCODE_IFF;
-        patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+        patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
         patch_insn->bits3.if_else.pop_count = 0;
         patch_insn->bits3.if_else.pad0 = 0;
       } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
-        patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+        patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
         patch_insn->bits3.if_else.pop_count = 1;
         patch_insn->bits3.if_else.pad0 = 0;
       } else {
@@ -689,7 +816,12 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 struct brw_instruction *brw_WHILE(struct brw_compile *p, 
                                   struct brw_instruction *do_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (intel->is_ironlake)
+      br = 2;
 
    if (p->single_program_flow)
       insn = next_insn(p, BRW_OPCODE_ADD);
@@ -710,7 +842,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
       insn->header.execution_size = do_insn->header.execution_size;
 
       assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = do_insn - insn + 1;
+      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
       insn->bits3.if_else.pop_count = 0;
       insn->bits3.if_else.pad0 = 0;
    }
@@ -728,12 +860,17 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
 void brw_land_fwd_jump(struct brw_compile *p, 
                       struct brw_instruction *jmp_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *landing = &p->store[p->nr_insn];
+   GLuint jmpi = 1;
+
+   if (intel->is_ironlake)
+       jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
-   assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
+   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
 
-   jmp_insn->bits3.ud = (landing - jmp_insn) - 1; 
+   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
 }
 
 
@@ -750,7 +887,7 @@ void brw_CMP(struct brw_compile *p,
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
-   insn->header.destreg__conditonalmod = conditional;
+   insn->header.destreg__conditionalmod = conditional;
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
@@ -786,25 +923,40 @@ void brw_math( struct brw_compile *p,
               GLuint data_type,
               GLuint precision )
 {
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; 
-   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; 
+   struct intel_context *intel = &p->brw->intel;
 
-   /* Example code doesn't set predicate_control for send
-    * instructions.
-    */
-   insn->header.predicate_control = 0; 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   if (intel->gen >= 6) {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
-                       msg_length, response_length, 
-                       function,
-                       BRW_MATH_INTEGER_UNSIGNED,
-                       precision,
-                       saturate,
-                       data_type);
+      /* Math is the same ISA format as other opcodes, except that CondModifier
+       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
+       */
+      insn->header.destreg__conditionalmod = function;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src);
+      brw_set_src1(insn, brw_null_reg());
+   } else {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
+      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
+      /* Example code doesn't set predicate_control for send
+       * instructions.
+       */
+      insn->header.predicate_control = 0;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, src);
+      brw_set_math_message(p->brw,
+                          insn,
+                          msg_length, response_length,
+                          function,
+                          BRW_MATH_INTEGER_UNSIGNED,
+                          precision,
+                          saturate,
+                          data_type);
+   }
 }
 
 /**
@@ -830,11 +982,12 @@ void brw_math_16( struct brw_compile *p,
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw,
+                       insn, 
                        msg_length, response_length, 
                        function,
                        BRW_MATH_INTEGER_UNSIGNED,
@@ -846,11 +999,12 @@ void brw_math_16( struct brw_compile *p,
     */
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
-   insn->header.destreg__conditonalmod = msg_reg_nr+1;
+   insn->header.destreg__conditionalmod = msg_reg_nr+1;
 
    brw_set_dest(insn, offset(dest,1));
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw, 
+                       insn, 
                        msg_length, response_length, 
                        function,
                        BRW_MATH_INTEGER_UNSIGNED,
@@ -892,12 +1046,13 @@ void brw_dp_WRITE_16( struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
   
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src);
 
-      brw_set_dp_write_message(insn,
+      brw_set_dp_write_message(p->brw,
+                              insn,
                               255, /* binding table index (255=stateless) */
                               BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
                               BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
@@ -937,12 +1092,13 @@ void brw_dp_READ_16( struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
   
       brw_set_dest(insn, dest);        /* UW? */
       brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+                             insn,
                              255, /* binding table index (255=stateless) */
                              3,  /* msg_control (3 means 4 Owords) */
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -990,7 +1146,7 @@ void brw_dp_READ_4( struct brw_compile *p,
    
       insn->header.predicate_control = BRW_PREDICATE_NONE;
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
       insn->header.mask_control = BRW_MASK_DISABLE;
   
       /* cast dest to a uword[8] vector */
@@ -999,7 +1155,8 @@ void brw_dp_READ_4( struct brw_compile *p,
       brw_set_dest(insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+                             insn,
                              bind_table_index,
                              0,  /* msg_control (0 means 1 Oword) */
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -1063,14 +1220,15 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
    
       insn->header.predicate_control = BRW_PREDICATE_NONE;
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
       insn->header.mask_control = BRW_MASK_DISABLE;
       /*insn->header.access_mode = BRW_ALIGN_16;*/
   
       brw_set_dest(insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+                             insn,
                              bind_table_index,
                              oword,  /* 0 = lower Oword, 1 = upper Oword */
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -1096,11 +1254,12 @@ void brw_fb_WRITE(struct brw_compile *p,
    
    insn->header.predicate_control = 0; /* XXX */
    insn->header.compression_control = BRW_COMPRESSION_NONE; 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
   
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
-   brw_set_dp_write_message(insn,
+   brw_set_dp_write_message(p->brw,
+                           insn,
                            binding_table_index,
                            BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
                            BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
@@ -1126,12 +1285,14 @@ void brw_SAMPLE(struct brw_compile *p,
                GLuint msg_type,
                GLuint response_length,
                GLuint msg_length,
-               GLboolean eot)
+               GLboolean eot,
+               GLuint header_present,
+               GLuint simd_mode)
 {
    GLboolean need_stall = 0;
-   
+
    if (writemask == 0) {
-      /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+      /*printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
    
@@ -1163,11 +1324,17 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
         need_stall = 1;
-         /* _mesa_printf("need stall %x %x\n", newmask , writemask); */
+         /* printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
+        GLboolean dispatch_16 = GL_FALSE;
+
         struct brw_reg m1 = brw_message_reg(msg_reg_nr);
-        
+
+        guess_execution_size(p->current, dest);
+        if (p->current->header.execution_size == BRW_EXECUTE_16)
+           dispatch_16 = GL_TRUE;
+
         newmask = ~newmask & WRITEMASK_XYZW;
 
         brw_push_insn_state(p);
@@ -1182,7 +1349,13 @@ void brw_SAMPLE(struct brw_compile *p,
 
         src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); 
         dest = offset(dest, dst_offset);
-        response_length = len * 2;
+
+        /* For 16-wide dispatch, masked channels are skipped in the
+         * response.  For 8-wide, masked channels still take up slots,
+         * and are just not written to.
+         */
+        if (dispatch_16)
+           response_length = len * 2;
       }
    }
 
@@ -1191,7 +1364,7 @@ void brw_SAMPLE(struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
 
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src0);
@@ -1201,7 +1374,9 @@ void brw_SAMPLE(struct brw_compile *p,
                              msg_type,
                              response_length, 
                              msg_length,
-                             eot);
+                             eot,
+                             header_present,
+                             simd_mode);
    }
 
    if (need_stall) {
@@ -1234,7 +1409,18 @@ void brw_urb_WRITE(struct brw_compile *p,
                   GLuint offset,
                   GLuint swizzle)
 {
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_instruction *insn;
+
+   /* Sandybridge doesn't have the implied move for SENDs,
+    * and the first message register index comes from src0.
+    */
+   if (intel->gen >= 6) {
+      brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
+      src0 = brw_message_reg(msg_reg_nr);
+   }
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
 
    assert(msg_length < BRW_MAX_MRF);
 
@@ -1242,9 +1428,11 @@ void brw_urb_WRITE(struct brw_compile *p,
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   if (intel->gen < 6)
+      insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_urb_message(insn,
+   brw_set_urb_message(p->brw,
+                      insn,
                       allocate,
                       used,
                       msg_length,
@@ -1255,3 +1443,37 @@ void brw_urb_WRITE(struct brw_compile *p,
                       swizzle);
 }
 
+void brw_ff_sync(struct brw_compile *p,
+                  struct brw_reg dest,
+                  GLuint msg_reg_nr,
+                  struct brw_reg src0,
+                  GLboolean allocate,
+                  GLboolean used,
+                  GLuint msg_length,
+                  GLuint response_length,
+                  GLboolean eot,
+                  GLboolean writes_complete,
+                  GLuint offset,
+                  GLuint swizzle)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   assert(msg_length < 16);
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+
+   brw_set_ff_sync_message(p->brw,
+                      insn,
+                      allocate,
+                      used,
+                      msg_length,
+                      response_length, 
+                      eot, 
+                      writes_complete, 
+                      offset,
+                      swizzle);
+}