i965: Attempt to un-muddle Gen6 data port message target defines.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37
38
39
40 /***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
44 static void guess_execution_size(struct brw_compile *p,
45 struct brw_instruction *insn,
46 struct brw_reg reg)
47 {
48 if (reg.width == BRW_WIDTH_8 && p->compressed)
49 insn->header.execution_size = BRW_EXECUTE_16;
50 else
51 insn->header.execution_size = reg.width; /* note - definitions are compatible */
52 }
53
54
55 /**
56 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
57 * registers, implicitly moving the operand to a message register.
58 *
59 * On Sandybridge, this is no longer the case. This function performs the
60 * explicit move; it should be called before emitting a SEND instruction.
61 */
62 static void
63 gen6_resolve_implied_move(struct brw_compile *p,
64 struct brw_reg *src,
65 GLuint msg_reg_nr)
66 {
67 struct intel_context *intel = &p->brw->intel;
68 if (intel->gen != 6)
69 return;
70
71 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
72 brw_push_insn_state(p);
73 brw_set_mask_control(p, BRW_MASK_DISABLE);
74 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
75 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
76 retype(*src, BRW_REGISTER_TYPE_UD));
77 brw_pop_insn_state(p);
78 }
79 *src = brw_message_reg(msg_reg_nr);
80 }
81
82
83 static void brw_set_dest(struct brw_compile *p,
84 struct brw_instruction *insn,
85 struct brw_reg dest)
86 {
87 if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
88 dest.file != BRW_MESSAGE_REGISTER_FILE)
89 assert(dest.nr < 128);
90
91 insn->bits1.da1.dest_reg_file = dest.file;
92 insn->bits1.da1.dest_reg_type = dest.type;
93 insn->bits1.da1.dest_address_mode = dest.address_mode;
94
95 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
96 insn->bits1.da1.dest_reg_nr = dest.nr;
97
98 if (insn->header.access_mode == BRW_ALIGN_1) {
99 insn->bits1.da1.dest_subreg_nr = dest.subnr;
100 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
101 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
102 insn->bits1.da1.dest_horiz_stride = dest.hstride;
103 }
104 else {
105 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
106 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
107 /* even ignored in da16, still need to set as '01' */
108 insn->bits1.da16.dest_horiz_stride = 1;
109 }
110 }
111 else {
112 insn->bits1.ia1.dest_subreg_nr = dest.subnr;
113
114 /* These are different sizes in align1 vs align16:
115 */
116 if (insn->header.access_mode == BRW_ALIGN_1) {
117 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
118 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
119 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
120 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
121 }
122 else {
123 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
124 /* even ignored in da16, still need to set as '01' */
125 insn->bits1.ia16.dest_horiz_stride = 1;
126 }
127 }
128
129 /* NEW: Set the execution size based on dest.width and
130 * insn->compression_control:
131 */
132 guess_execution_size(p, insn, dest);
133 }
134
135 extern int reg_type_size[];
136
137 static void
138 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
139 {
140 int hstride_for_reg[] = {0, 1, 2, 4};
141 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
142 int width_for_reg[] = {1, 2, 4, 8, 16};
143 int execsize_for_reg[] = {1, 2, 4, 8, 16};
144 int width, hstride, vstride, execsize;
145
146 if (reg.file == BRW_IMMEDIATE_VALUE) {
147 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
148 * mean the destination has to be 128-bit aligned and the
149 * destination horiz stride has to be a word.
150 */
151 if (reg.type == BRW_REGISTER_TYPE_V) {
152 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
153 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
154 }
155
156 return;
157 }
158
159 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
160 reg.file == BRW_ARF_NULL)
161 return;
162
163 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
164 hstride = hstride_for_reg[reg.hstride];
165
166 if (reg.vstride == 0xf) {
167 vstride = -1;
168 } else {
169 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
170 vstride = vstride_for_reg[reg.vstride];
171 }
172
173 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
174 width = width_for_reg[reg.width];
175
176 assert(insn->header.execution_size >= 0 &&
177 insn->header.execution_size < Elements(execsize_for_reg));
178 execsize = execsize_for_reg[insn->header.execution_size];
179
180 /* Restrictions from 3.3.10: Register Region Restrictions. */
181 /* 3. */
182 assert(execsize >= width);
183
184 /* 4. */
185 if (execsize == width && hstride != 0) {
186 assert(vstride == -1 || vstride == width * hstride);
187 }
188
189 /* 5. */
190 if (execsize == width && hstride == 0) {
191 /* no restriction on vstride. */
192 }
193
194 /* 6. */
195 if (width == 1) {
196 assert(hstride == 0);
197 }
198
199 /* 7. */
200 if (execsize == 1 && width == 1) {
201 assert(hstride == 0);
202 assert(vstride == 0);
203 }
204
205 /* 8. */
206 if (vstride == 0 && hstride == 0) {
207 assert(width == 1);
208 }
209
210 /* 10. Check destination issues. */
211 }
212
213 static void brw_set_src0( struct brw_instruction *insn,
214 struct brw_reg reg )
215 {
216 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
217 assert(reg.nr < 128);
218
219 validate_reg(insn, reg);
220
221 insn->bits1.da1.src0_reg_file = reg.file;
222 insn->bits1.da1.src0_reg_type = reg.type;
223 insn->bits2.da1.src0_abs = reg.abs;
224 insn->bits2.da1.src0_negate = reg.negate;
225 insn->bits2.da1.src0_address_mode = reg.address_mode;
226
227 if (reg.file == BRW_IMMEDIATE_VALUE) {
228 insn->bits3.ud = reg.dw1.ud;
229
230 /* Required to set some fields in src1 as well:
231 */
232 insn->bits1.da1.src1_reg_file = 0; /* arf */
233 insn->bits1.da1.src1_reg_type = reg.type;
234 }
235 else
236 {
237 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
238 if (insn->header.access_mode == BRW_ALIGN_1) {
239 insn->bits2.da1.src0_subreg_nr = reg.subnr;
240 insn->bits2.da1.src0_reg_nr = reg.nr;
241 }
242 else {
243 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
244 insn->bits2.da16.src0_reg_nr = reg.nr;
245 }
246 }
247 else {
248 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
249
250 if (insn->header.access_mode == BRW_ALIGN_1) {
251 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
252 }
253 else {
254 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
255 }
256 }
257
258 if (insn->header.access_mode == BRW_ALIGN_1) {
259 if (reg.width == BRW_WIDTH_1 &&
260 insn->header.execution_size == BRW_EXECUTE_1) {
261 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
262 insn->bits2.da1.src0_width = BRW_WIDTH_1;
263 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
264 }
265 else {
266 insn->bits2.da1.src0_horiz_stride = reg.hstride;
267 insn->bits2.da1.src0_width = reg.width;
268 insn->bits2.da1.src0_vert_stride = reg.vstride;
269 }
270 }
271 else {
272 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
273 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
274 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
275 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
276
277 /* This is an oddity of the fact we're using the same
278 * descriptions for registers in align_16 as align_1:
279 */
280 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
281 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
282 else
283 insn->bits2.da16.src0_vert_stride = reg.vstride;
284 }
285 }
286 }
287
288
289 void brw_set_src1( struct brw_instruction *insn,
290 struct brw_reg reg )
291 {
292 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
293
294 assert(reg.nr < 128);
295
296 validate_reg(insn, reg);
297
298 insn->bits1.da1.src1_reg_file = reg.file;
299 insn->bits1.da1.src1_reg_type = reg.type;
300 insn->bits3.da1.src1_abs = reg.abs;
301 insn->bits3.da1.src1_negate = reg.negate;
302
303 /* Only src1 can be immediate in two-argument instructions.
304 */
305 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
306
307 if (reg.file == BRW_IMMEDIATE_VALUE) {
308 insn->bits3.ud = reg.dw1.ud;
309 }
310 else {
311 /* This is a hardware restriction, which may or may not be lifted
312 * in the future:
313 */
314 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
315 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
316
317 if (insn->header.access_mode == BRW_ALIGN_1) {
318 insn->bits3.da1.src1_subreg_nr = reg.subnr;
319 insn->bits3.da1.src1_reg_nr = reg.nr;
320 }
321 else {
322 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
323 insn->bits3.da16.src1_reg_nr = reg.nr;
324 }
325
326 if (insn->header.access_mode == BRW_ALIGN_1) {
327 if (reg.width == BRW_WIDTH_1 &&
328 insn->header.execution_size == BRW_EXECUTE_1) {
329 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
330 insn->bits3.da1.src1_width = BRW_WIDTH_1;
331 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
332 }
333 else {
334 insn->bits3.da1.src1_horiz_stride = reg.hstride;
335 insn->bits3.da1.src1_width = reg.width;
336 insn->bits3.da1.src1_vert_stride = reg.vstride;
337 }
338 }
339 else {
340 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
341 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
342 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
343 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
344
345 /* This is an oddity of the fact we're using the same
346 * descriptions for registers in align_16 as align_1:
347 */
348 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
349 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
350 else
351 insn->bits3.da16.src1_vert_stride = reg.vstride;
352 }
353 }
354 }
355
356
357
358 static void brw_set_math_message( struct brw_context *brw,
359 struct brw_instruction *insn,
360 GLuint msg_length,
361 GLuint response_length,
362 GLuint function,
363 GLuint integer_type,
364 GLboolean low_precision,
365 GLboolean saturate,
366 GLuint dataType )
367 {
368 struct intel_context *intel = &brw->intel;
369 brw_set_src1(insn, brw_imm_d(0));
370
371 if (intel->gen == 5) {
372 insn->bits3.math_gen5.function = function;
373 insn->bits3.math_gen5.int_type = integer_type;
374 insn->bits3.math_gen5.precision = low_precision;
375 insn->bits3.math_gen5.saturate = saturate;
376 insn->bits3.math_gen5.data_type = dataType;
377 insn->bits3.math_gen5.snapshot = 0;
378 insn->bits3.math_gen5.header_present = 0;
379 insn->bits3.math_gen5.response_length = response_length;
380 insn->bits3.math_gen5.msg_length = msg_length;
381 insn->bits3.math_gen5.end_of_thread = 0;
382 insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
383 insn->bits2.send_gen5.end_of_thread = 0;
384 } else {
385 insn->bits3.math.function = function;
386 insn->bits3.math.int_type = integer_type;
387 insn->bits3.math.precision = low_precision;
388 insn->bits3.math.saturate = saturate;
389 insn->bits3.math.data_type = dataType;
390 insn->bits3.math.response_length = response_length;
391 insn->bits3.math.msg_length = msg_length;
392 insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
393 insn->bits3.math.end_of_thread = 0;
394 }
395 }
396
397
398 static void brw_set_ff_sync_message(struct brw_context *brw,
399 struct brw_instruction *insn,
400 GLboolean allocate,
401 GLuint response_length,
402 GLboolean end_of_thread)
403 {
404 struct intel_context *intel = &brw->intel;
405 brw_set_src1(insn, brw_imm_d(0));
406
407 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
408 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
409 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
410 insn->bits3.urb_gen5.allocate = allocate;
411 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
412 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
413 insn->bits3.urb_gen5.header_present = 1;
414 insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
415 insn->bits3.urb_gen5.msg_length = 1;
416 insn->bits3.urb_gen5.end_of_thread = end_of_thread;
417 if (intel->gen >= 6) {
418 insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
419 } else {
420 insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
421 insn->bits2.send_gen5.end_of_thread = end_of_thread;
422 }
423 }
424
425 static void brw_set_urb_message( struct brw_context *brw,
426 struct brw_instruction *insn,
427 GLboolean allocate,
428 GLboolean used,
429 GLuint msg_length,
430 GLuint response_length,
431 GLboolean end_of_thread,
432 GLboolean complete,
433 GLuint offset,
434 GLuint swizzle_control )
435 {
436 struct intel_context *intel = &brw->intel;
437 brw_set_src1(insn, brw_imm_d(0));
438
439 if (intel->gen >= 5) {
440 insn->bits3.urb_gen5.opcode = 0; /* ? */
441 insn->bits3.urb_gen5.offset = offset;
442 insn->bits3.urb_gen5.swizzle_control = swizzle_control;
443 insn->bits3.urb_gen5.allocate = allocate;
444 insn->bits3.urb_gen5.used = used; /* ? */
445 insn->bits3.urb_gen5.complete = complete;
446 insn->bits3.urb_gen5.header_present = 1;
447 insn->bits3.urb_gen5.response_length = response_length;
448 insn->bits3.urb_gen5.msg_length = msg_length;
449 insn->bits3.urb_gen5.end_of_thread = end_of_thread;
450 if (intel->gen >= 6) {
451 /* For SNB, the SFID bits moved to the condmod bits, and
452 * EOT stayed in bits3 above. Does the EOT bit setting
453 * below on Ironlake even do anything?
454 */
455 insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
456 } else {
457 insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
458 insn->bits2.send_gen5.end_of_thread = end_of_thread;
459 }
460 } else {
461 insn->bits3.urb.opcode = 0; /* ? */
462 insn->bits3.urb.offset = offset;
463 insn->bits3.urb.swizzle_control = swizzle_control;
464 insn->bits3.urb.allocate = allocate;
465 insn->bits3.urb.used = used; /* ? */
466 insn->bits3.urb.complete = complete;
467 insn->bits3.urb.response_length = response_length;
468 insn->bits3.urb.msg_length = msg_length;
469 insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
470 insn->bits3.urb.end_of_thread = end_of_thread;
471 }
472 }
473
474 static void brw_set_dp_write_message( struct brw_context *brw,
475 struct brw_instruction *insn,
476 GLuint binding_table_index,
477 GLuint msg_control,
478 GLuint msg_type,
479 GLuint msg_length,
480 GLboolean header_present,
481 GLuint pixel_scoreboard_clear,
482 GLuint response_length,
483 GLuint end_of_thread,
484 GLuint send_commit_msg)
485 {
486 struct intel_context *intel = &brw->intel;
487 brw_set_src1(insn, brw_imm_ud(0));
488
489 if (intel->gen >= 6) {
490 insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
491 insn->bits3.dp_render_cache.msg_control = msg_control;
492 insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
493 insn->bits3.dp_render_cache.msg_type = msg_type;
494 insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
495 insn->bits3.dp_render_cache.header_present = header_present;
496 insn->bits3.dp_render_cache.response_length = response_length;
497 insn->bits3.dp_render_cache.msg_length = msg_length;
498 insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
499
500 /* We always use the render cache for write messages */
501 insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
502 } else if (intel->gen == 5) {
503 insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
504 insn->bits3.dp_write_gen5.msg_control = msg_control;
505 insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
506 insn->bits3.dp_write_gen5.msg_type = msg_type;
507 insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
508 insn->bits3.dp_write_gen5.header_present = header_present;
509 insn->bits3.dp_write_gen5.response_length = response_length;
510 insn->bits3.dp_write_gen5.msg_length = msg_length;
511 insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
512 insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
513 insn->bits2.send_gen5.end_of_thread = end_of_thread;
514 } else {
515 insn->bits3.dp_write.binding_table_index = binding_table_index;
516 insn->bits3.dp_write.msg_control = msg_control;
517 insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
518 insn->bits3.dp_write.msg_type = msg_type;
519 insn->bits3.dp_write.send_commit_msg = send_commit_msg;
520 insn->bits3.dp_write.response_length = response_length;
521 insn->bits3.dp_write.msg_length = msg_length;
522 insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
523 insn->bits3.dp_write.end_of_thread = end_of_thread;
524 }
525 }
526
527 static void
528 brw_set_dp_read_message(struct brw_context *brw,
529 struct brw_instruction *insn,
530 GLuint binding_table_index,
531 GLuint msg_control,
532 GLuint msg_type,
533 GLuint target_cache,
534 GLuint msg_length,
535 GLuint response_length)
536 {
537 struct intel_context *intel = &brw->intel;
538 brw_set_src1(insn, brw_imm_d(0));
539
540 if (intel->gen >= 6) {
541 uint32_t target_function;
542
543 if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
544 target_function = GEN6_MESSAGE_TARGET_DP_SAMPLER_CACHE;
545 else
546 target_function = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
547
548 insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
549 insn->bits3.dp_render_cache.msg_control = msg_control;
550 insn->bits3.dp_render_cache.pixel_scoreboard_clear = 0;
551 insn->bits3.dp_render_cache.msg_type = msg_type;
552 insn->bits3.dp_render_cache.send_commit_msg = 0;
553 insn->bits3.dp_render_cache.header_present = 1;
554 insn->bits3.dp_render_cache.response_length = response_length;
555 insn->bits3.dp_render_cache.msg_length = msg_length;
556 insn->bits3.dp_render_cache.end_of_thread = 0;
557 insn->header.destreg__conditionalmod = target_function;
558 } else if (intel->gen == 5) {
559 insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
560 insn->bits3.dp_read_gen5.msg_control = msg_control;
561 insn->bits3.dp_read_gen5.msg_type = msg_type;
562 insn->bits3.dp_read_gen5.target_cache = target_cache;
563 insn->bits3.dp_read_gen5.header_present = 1;
564 insn->bits3.dp_read_gen5.response_length = response_length;
565 insn->bits3.dp_read_gen5.msg_length = msg_length;
566 insn->bits3.dp_read_gen5.pad1 = 0;
567 insn->bits3.dp_read_gen5.end_of_thread = 0;
568 insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
569 insn->bits2.send_gen5.end_of_thread = 0;
570 } else if (intel->is_g4x) {
571 insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
572 insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
573 insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
574 insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/
575 insn->bits3.dp_read_g4x.response_length = response_length; /*16:19*/
576 insn->bits3.dp_read_g4x.msg_length = msg_length; /*20:23*/
577 insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
578 insn->bits3.dp_read_g4x.pad1 = 0;
579 insn->bits3.dp_read_g4x.end_of_thread = 0;
580 } else {
581 insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
582 insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
583 insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
584 insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
585 insn->bits3.dp_read.response_length = response_length; /*16:19*/
586 insn->bits3.dp_read.msg_length = msg_length; /*20:23*/
587 insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
588 insn->bits3.dp_read.pad1 = 0; /*28:30*/
589 insn->bits3.dp_read.end_of_thread = 0; /*31*/
590 }
591 }
592
593 static void brw_set_sampler_message(struct brw_context *brw,
594 struct brw_instruction *insn,
595 GLuint binding_table_index,
596 GLuint sampler,
597 GLuint msg_type,
598 GLuint response_length,
599 GLuint msg_length,
600 GLboolean eot,
601 GLuint header_present,
602 GLuint simd_mode)
603 {
604 struct intel_context *intel = &brw->intel;
605 assert(eot == 0);
606 brw_set_src1(insn, brw_imm_d(0));
607
608 if (intel->gen >= 5) {
609 insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
610 insn->bits3.sampler_gen5.sampler = sampler;
611 insn->bits3.sampler_gen5.msg_type = msg_type;
612 insn->bits3.sampler_gen5.simd_mode = simd_mode;
613 insn->bits3.sampler_gen5.header_present = header_present;
614 insn->bits3.sampler_gen5.response_length = response_length;
615 insn->bits3.sampler_gen5.msg_length = msg_length;
616 insn->bits3.sampler_gen5.end_of_thread = eot;
617 if (intel->gen >= 6)
618 insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
619 else {
620 insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
621 insn->bits2.send_gen5.end_of_thread = eot;
622 }
623 } else if (intel->is_g4x) {
624 insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
625 insn->bits3.sampler_g4x.sampler = sampler;
626 insn->bits3.sampler_g4x.msg_type = msg_type;
627 insn->bits3.sampler_g4x.response_length = response_length;
628 insn->bits3.sampler_g4x.msg_length = msg_length;
629 insn->bits3.sampler_g4x.end_of_thread = eot;
630 insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
631 } else {
632 insn->bits3.sampler.binding_table_index = binding_table_index;
633 insn->bits3.sampler.sampler = sampler;
634 insn->bits3.sampler.msg_type = msg_type;
635 insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
636 insn->bits3.sampler.response_length = response_length;
637 insn->bits3.sampler.msg_length = msg_length;
638 insn->bits3.sampler.end_of_thread = eot;
639 insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
640 }
641 }
642
643
644
645 static struct brw_instruction *next_insn( struct brw_compile *p,
646 GLuint opcode )
647 {
648 struct brw_instruction *insn;
649
650 assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
651
652 insn = &p->store[p->nr_insn++];
653 memcpy(insn, p->current, sizeof(*insn));
654
655 /* Reset this one-shot flag:
656 */
657
658 if (p->current->header.destreg__conditionalmod) {
659 p->current->header.destreg__conditionalmod = 0;
660 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
661 }
662
663 insn->header.opcode = opcode;
664 return insn;
665 }
666
667
668 static struct brw_instruction *brw_alu1( struct brw_compile *p,
669 GLuint opcode,
670 struct brw_reg dest,
671 struct brw_reg src )
672 {
673 struct brw_instruction *insn = next_insn(p, opcode);
674 brw_set_dest(p, insn, dest);
675 brw_set_src0(insn, src);
676 return insn;
677 }
678
679 static struct brw_instruction *brw_alu2(struct brw_compile *p,
680 GLuint opcode,
681 struct brw_reg dest,
682 struct brw_reg src0,
683 struct brw_reg src1 )
684 {
685 struct brw_instruction *insn = next_insn(p, opcode);
686 brw_set_dest(p, insn, dest);
687 brw_set_src0(insn, src0);
688 brw_set_src1(insn, src1);
689 return insn;
690 }
691
692
693 /***********************************************************************
694 * Convenience routines.
695 */
696 #define ALU1(OP) \
697 struct brw_instruction *brw_##OP(struct brw_compile *p, \
698 struct brw_reg dest, \
699 struct brw_reg src0) \
700 { \
701 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
702 }
703
704 #define ALU2(OP) \
705 struct brw_instruction *brw_##OP(struct brw_compile *p, \
706 struct brw_reg dest, \
707 struct brw_reg src0, \
708 struct brw_reg src1) \
709 { \
710 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
711 }
712
713 /* Rounding operations (other than RNDD) require two instructions - the first
714 * stores a rounded value (possibly the wrong way) in the dest register, but
715 * also sets a per-channel "increment bit" in the flag register. A predicated
716 * add of 1.0 fixes dest to contain the desired result.
717 */
718 #define ROUND(OP) \
719 void brw_##OP(struct brw_compile *p, \
720 struct brw_reg dest, \
721 struct brw_reg src) \
722 { \
723 struct brw_instruction *rnd, *add; \
724 rnd = next_insn(p, BRW_OPCODE_##OP); \
725 brw_set_dest(p, rnd, dest); \
726 brw_set_src0(rnd, src); \
727 rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */ \
728 \
729 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
730 add->header.predicate_control = BRW_PREDICATE_NORMAL; \
731 }
732
733
734 ALU1(MOV)
735 ALU2(SEL)
736 ALU1(NOT)
737 ALU2(AND)
738 ALU2(OR)
739 ALU2(XOR)
740 ALU2(SHR)
741 ALU2(SHL)
742 ALU2(RSR)
743 ALU2(RSL)
744 ALU2(ASR)
745 ALU1(FRC)
746 ALU1(RNDD)
747 ALU2(MAC)
748 ALU2(MACH)
749 ALU1(LZD)
750 ALU2(DP4)
751 ALU2(DPH)
752 ALU2(DP3)
753 ALU2(DP2)
754 ALU2(LINE)
755 ALU2(PLN)
756
757
758 ROUND(RNDZ)
759 ROUND(RNDE)
760
761
762 struct brw_instruction *brw_ADD(struct brw_compile *p,
763 struct brw_reg dest,
764 struct brw_reg src0,
765 struct brw_reg src1)
766 {
767 /* 6.2.2: add */
768 if (src0.type == BRW_REGISTER_TYPE_F ||
769 (src0.file == BRW_IMMEDIATE_VALUE &&
770 src0.type == BRW_REGISTER_TYPE_VF)) {
771 assert(src1.type != BRW_REGISTER_TYPE_UD);
772 assert(src1.type != BRW_REGISTER_TYPE_D);
773 }
774
775 if (src1.type == BRW_REGISTER_TYPE_F ||
776 (src1.file == BRW_IMMEDIATE_VALUE &&
777 src1.type == BRW_REGISTER_TYPE_VF)) {
778 assert(src0.type != BRW_REGISTER_TYPE_UD);
779 assert(src0.type != BRW_REGISTER_TYPE_D);
780 }
781
782 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
783 }
784
785 struct brw_instruction *brw_MUL(struct brw_compile *p,
786 struct brw_reg dest,
787 struct brw_reg src0,
788 struct brw_reg src1)
789 {
790 /* 6.32.38: mul */
791 if (src0.type == BRW_REGISTER_TYPE_D ||
792 src0.type == BRW_REGISTER_TYPE_UD ||
793 src1.type == BRW_REGISTER_TYPE_D ||
794 src1.type == BRW_REGISTER_TYPE_UD) {
795 assert(dest.type != BRW_REGISTER_TYPE_F);
796 }
797
798 if (src0.type == BRW_REGISTER_TYPE_F ||
799 (src0.file == BRW_IMMEDIATE_VALUE &&
800 src0.type == BRW_REGISTER_TYPE_VF)) {
801 assert(src1.type != BRW_REGISTER_TYPE_UD);
802 assert(src1.type != BRW_REGISTER_TYPE_D);
803 }
804
805 if (src1.type == BRW_REGISTER_TYPE_F ||
806 (src1.file == BRW_IMMEDIATE_VALUE &&
807 src1.type == BRW_REGISTER_TYPE_VF)) {
808 assert(src0.type != BRW_REGISTER_TYPE_UD);
809 assert(src0.type != BRW_REGISTER_TYPE_D);
810 }
811
812 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
813 src0.nr != BRW_ARF_ACCUMULATOR);
814 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
815 src1.nr != BRW_ARF_ACCUMULATOR);
816
817 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
818 }
819
820
821 void brw_NOP(struct brw_compile *p)
822 {
823 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
824 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
825 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
826 brw_set_src1(insn, brw_imm_ud(0x0));
827 }
828
829
830
831
832
833 /***********************************************************************
834 * Comparisons, if/else/endif
835 */
836
837 struct brw_instruction *brw_JMPI(struct brw_compile *p,
838 struct brw_reg dest,
839 struct brw_reg src0,
840 struct brw_reg src1)
841 {
842 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
843
844 insn->header.execution_size = 1;
845 insn->header.compression_control = BRW_COMPRESSION_NONE;
846 insn->header.mask_control = BRW_MASK_DISABLE;
847
848 p->current->header.predicate_control = BRW_PREDICATE_NONE;
849
850 return insn;
851 }
852
853 /* EU takes the value from the flag register and pushes it onto some
854 * sort of a stack (presumably merging with any flag value already on
855 * the stack). Within an if block, the flags at the top of the stack
856 * control execution on each channel of the unit, eg. on each of the
857 * 16 pixel values in our wm programs.
858 *
859 * When the matching 'else' instruction is reached (presumably by
860 * countdown of the instruction count patched in by our ELSE/ENDIF
861 * functions), the relevent flags are inverted.
862 *
863 * When the matching 'endif' instruction is reached, the flags are
864 * popped off. If the stack is now empty, normal execution resumes.
865 *
866 * No attempt is made to deal with stack overflow (14 elements?).
867 */
868 struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
869 {
870 struct intel_context *intel = &p->brw->intel;
871 struct brw_instruction *insn;
872
873 if (p->single_program_flow) {
874 assert(execute_size == BRW_EXECUTE_1);
875
876 insn = next_insn(p, BRW_OPCODE_ADD);
877 insn->header.predicate_inverse = 1;
878 } else {
879 insn = next_insn(p, BRW_OPCODE_IF);
880 }
881
882 /* Override the defaults for this instruction:
883 */
884 if (intel->gen < 6) {
885 brw_set_dest(p, insn, brw_ip_reg());
886 brw_set_src0(insn, brw_ip_reg());
887 brw_set_src1(insn, brw_imm_d(0x0));
888 } else {
889 brw_set_dest(p, insn, brw_imm_w(0));
890 insn->bits1.branch_gen6.jump_count = 0;
891 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
892 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
893 }
894
895 insn->header.execution_size = execute_size;
896 insn->header.compression_control = BRW_COMPRESSION_NONE;
897 insn->header.predicate_control = BRW_PREDICATE_NORMAL;
898 insn->header.mask_control = BRW_MASK_ENABLE;
899 if (!p->single_program_flow)
900 insn->header.thread_control = BRW_THREAD_SWITCH;
901
902 p->current->header.predicate_control = BRW_PREDICATE_NONE;
903
904 return insn;
905 }
906
907 struct brw_instruction *
908 gen6_IF(struct brw_compile *p, uint32_t conditional,
909 struct brw_reg src0, struct brw_reg src1)
910 {
911 struct brw_instruction *insn;
912
913 insn = next_insn(p, BRW_OPCODE_IF);
914
915 brw_set_dest(p, insn, brw_imm_w(0));
916 insn->header.execution_size = BRW_EXECUTE_8;
917 insn->bits1.branch_gen6.jump_count = 0;
918 brw_set_src0(insn, src0);
919 brw_set_src1(insn, src1);
920
921 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
922 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
923 insn->header.destreg__conditionalmod = conditional;
924
925 if (!p->single_program_flow)
926 insn->header.thread_control = BRW_THREAD_SWITCH;
927
928 return insn;
929 }
930
931 struct brw_instruction *brw_ELSE(struct brw_compile *p,
932 struct brw_instruction *if_insn)
933 {
934 struct intel_context *intel = &p->brw->intel;
935 struct brw_instruction *insn;
936 GLuint br = 1;
937
938 /* jump count is for 64bit data chunk each, so one 128bit
939 instruction requires 2 chunks. */
940 if (intel->gen >= 5)
941 br = 2;
942
943 if (p->single_program_flow) {
944 insn = next_insn(p, BRW_OPCODE_ADD);
945 } else {
946 insn = next_insn(p, BRW_OPCODE_ELSE);
947 }
948
949 if (intel->gen < 6) {
950 brw_set_dest(p, insn, brw_ip_reg());
951 brw_set_src0(insn, brw_ip_reg());
952 brw_set_src1(insn, brw_imm_d(0x0));
953 } else {
954 brw_set_dest(p, insn, brw_imm_w(0));
955 insn->bits1.branch_gen6.jump_count = 0;
956 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
957 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
958 }
959
960 insn->header.compression_control = BRW_COMPRESSION_NONE;
961 insn->header.execution_size = if_insn->header.execution_size;
962 insn->header.mask_control = BRW_MASK_ENABLE;
963 if (!p->single_program_flow)
964 insn->header.thread_control = BRW_THREAD_SWITCH;
965
966 /* Patch the if instruction to point at this instruction.
967 */
968 if (p->single_program_flow) {
969 assert(if_insn->header.opcode == BRW_OPCODE_ADD);
970
971 if_insn->bits3.ud = (insn - if_insn + 1) * 16;
972 } else {
973 assert(if_insn->header.opcode == BRW_OPCODE_IF);
974
975 if (intel->gen < 6) {
976 if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
977 if_insn->bits3.if_else.pop_count = 0;
978 if_insn->bits3.if_else.pad0 = 0;
979 } else {
980 if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1);
981 }
982 }
983
984 return insn;
985 }
986
987 void brw_ENDIF(struct brw_compile *p,
988 struct brw_instruction *patch_insn)
989 {
990 struct intel_context *intel = &p->brw->intel;
991 GLuint br = 1;
992
993 if (intel->gen >= 5)
994 br = 2;
995
996 if (p->single_program_flow) {
997 /* In single program flow mode, there's no need to execute an ENDIF,
998 * since we don't need to do any stack operations, and if we're executing
999 * currently, we want to just continue executing.
1000 */
1001 struct brw_instruction *next = &p->store[p->nr_insn];
1002
1003 assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
1004
1005 patch_insn->bits3.ud = (next - patch_insn) * 16;
1006 } else {
1007 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
1008
1009 if (intel->gen < 6) {
1010 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1011 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1012 brw_set_src1(insn, brw_imm_d(0x0));
1013 } else {
1014 brw_set_dest(p, insn, brw_imm_w(0));
1015 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1016 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1017 }
1018
1019 insn->header.compression_control = BRW_COMPRESSION_NONE;
1020 insn->header.execution_size = patch_insn->header.execution_size;
1021 insn->header.mask_control = BRW_MASK_ENABLE;
1022 insn->header.thread_control = BRW_THREAD_SWITCH;
1023
1024 if (intel->gen < 6)
1025 assert(patch_insn->bits3.if_else.jump_count == 0);
1026 else
1027 assert(patch_insn->bits1.branch_gen6.jump_count == 0);
1028
1029 /* Patch the if or else instructions to point at this or the next
1030 * instruction respectively.
1031 */
1032 if (patch_insn->header.opcode == BRW_OPCODE_IF) {
1033 if (intel->gen < 6) {
1034 /* Turn it into an IFF, which means no mask stack operations for
1035 * all-false and jumping past the ENDIF.
1036 */
1037 patch_insn->header.opcode = BRW_OPCODE_IFF;
1038 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
1039 patch_insn->bits3.if_else.pop_count = 0;
1040 patch_insn->bits3.if_else.pad0 = 0;
1041 } else {
1042 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1043 patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1044 }
1045 } else {
1046 assert(patch_insn->header.opcode == BRW_OPCODE_ELSE);
1047 if (intel->gen < 6) {
1048 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1049 * matching ENDIF.
1050 */
1051 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
1052 patch_insn->bits3.if_else.pop_count = 1;
1053 patch_insn->bits3.if_else.pad0 = 0;
1054 } else {
1055 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1056 patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1057 }
1058 }
1059
1060 /* Also pop item off the stack in the endif instruction:
1061 */
1062 if (intel->gen < 6) {
1063 insn->bits3.if_else.jump_count = 0;
1064 insn->bits3.if_else.pop_count = 1;
1065 insn->bits3.if_else.pad0 = 0;
1066 } else {
1067 insn->bits1.branch_gen6.jump_count = 2;
1068 }
1069 }
1070 }
1071
1072 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1073 {
1074 struct intel_context *intel = &p->brw->intel;
1075 struct brw_instruction *insn;
1076
1077 insn = next_insn(p, BRW_OPCODE_BREAK);
1078 if (intel->gen >= 6) {
1079 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1080 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1081 brw_set_src1(insn, brw_imm_d(0x0));
1082 } else {
1083 brw_set_dest(p, insn, brw_ip_reg());
1084 brw_set_src0(insn, brw_ip_reg());
1085 brw_set_src1(insn, brw_imm_d(0x0));
1086 insn->bits3.if_else.pad0 = 0;
1087 insn->bits3.if_else.pop_count = pop_count;
1088 }
1089 insn->header.compression_control = BRW_COMPRESSION_NONE;
1090 insn->header.execution_size = BRW_EXECUTE_8;
1091
1092 return insn;
1093 }
1094
1095 struct brw_instruction *gen6_CONT(struct brw_compile *p,
1096 struct brw_instruction *do_insn)
1097 {
1098 struct brw_instruction *insn;
1099 int br = 2;
1100
1101 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1102 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1103 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1104 brw_set_dest(p, insn, brw_ip_reg());
1105 brw_set_src0(insn, brw_ip_reg());
1106 brw_set_src1(insn, brw_imm_d(0x0));
1107
1108 insn->bits3.break_cont.uip = br * (do_insn - insn);
1109
1110 insn->header.compression_control = BRW_COMPRESSION_NONE;
1111 insn->header.execution_size = BRW_EXECUTE_8;
1112 return insn;
1113 }
1114
1115 struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1116 {
1117 struct brw_instruction *insn;
1118 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1119 brw_set_dest(p, insn, brw_ip_reg());
1120 brw_set_src0(insn, brw_ip_reg());
1121 brw_set_src1(insn, brw_imm_d(0x0));
1122 insn->header.compression_control = BRW_COMPRESSION_NONE;
1123 insn->header.execution_size = BRW_EXECUTE_8;
1124 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1125 insn->bits3.if_else.pad0 = 0;
1126 insn->bits3.if_else.pop_count = pop_count;
1127 return insn;
1128 }
1129
1130 /* DO/WHILE loop:
1131 *
1132 * The DO/WHILE is just an unterminated loop -- break or continue are
1133 * used for control within the loop. We have a few ways they can be
1134 * done.
1135 *
1136 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1137 * jip and no DO instruction.
1138 *
1139 * For non-uniform control flow pre-gen6, there's a DO instruction to
1140 * push the mask, and a WHILE to jump back, and BREAK to get out and
1141 * pop the mask.
1142 *
1143 * For gen6, there's no more mask stack, so no need for DO. WHILE
1144 * just points back to the first instruction of the loop.
1145 */
1146 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1147 {
1148 struct intel_context *intel = &p->brw->intel;
1149
1150 if (intel->gen >= 6 || p->single_program_flow) {
1151 return &p->store[p->nr_insn];
1152 } else {
1153 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1154
1155 /* Override the defaults for this instruction:
1156 */
1157 brw_set_dest(p, insn, brw_null_reg());
1158 brw_set_src0(insn, brw_null_reg());
1159 brw_set_src1(insn, brw_null_reg());
1160
1161 insn->header.compression_control = BRW_COMPRESSION_NONE;
1162 insn->header.execution_size = execute_size;
1163 insn->header.predicate_control = BRW_PREDICATE_NONE;
1164 /* insn->header.mask_control = BRW_MASK_ENABLE; */
1165 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1166
1167 return insn;
1168 }
1169 }
1170
1171
1172
1173 struct brw_instruction *brw_WHILE(struct brw_compile *p,
1174 struct brw_instruction *do_insn)
1175 {
1176 struct intel_context *intel = &p->brw->intel;
1177 struct brw_instruction *insn;
1178 GLuint br = 1;
1179
1180 if (intel->gen >= 5)
1181 br = 2;
1182
1183 if (intel->gen >= 6) {
1184 insn = next_insn(p, BRW_OPCODE_WHILE);
1185
1186 brw_set_dest(p, insn, brw_imm_w(0));
1187 insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1188 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1189 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1190
1191 insn->header.execution_size = do_insn->header.execution_size;
1192 assert(insn->header.execution_size == BRW_EXECUTE_8);
1193 } else {
1194 if (p->single_program_flow) {
1195 insn = next_insn(p, BRW_OPCODE_ADD);
1196
1197 brw_set_dest(p, insn, brw_ip_reg());
1198 brw_set_src0(insn, brw_ip_reg());
1199 brw_set_src1(insn, brw_imm_d((do_insn - insn) * 16));
1200 insn->header.execution_size = BRW_EXECUTE_1;
1201 } else {
1202 insn = next_insn(p, BRW_OPCODE_WHILE);
1203
1204 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1205
1206 brw_set_dest(p, insn, brw_ip_reg());
1207 brw_set_src0(insn, brw_ip_reg());
1208 brw_set_src1(insn, brw_imm_d(0));
1209
1210 insn->header.execution_size = do_insn->header.execution_size;
1211 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1212 insn->bits3.if_else.pop_count = 0;
1213 insn->bits3.if_else.pad0 = 0;
1214 }
1215 }
1216 insn->header.compression_control = BRW_COMPRESSION_NONE;
1217 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1218
1219 return insn;
1220 }
1221
1222
1223 /* FORWARD JUMPS:
1224 */
1225 void brw_land_fwd_jump(struct brw_compile *p,
1226 struct brw_instruction *jmp_insn)
1227 {
1228 struct intel_context *intel = &p->brw->intel;
1229 struct brw_instruction *landing = &p->store[p->nr_insn];
1230 GLuint jmpi = 1;
1231
1232 if (intel->gen >= 5)
1233 jmpi = 2;
1234
1235 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1236 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1237
1238 jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1239 }
1240
1241
1242
1243 /* To integrate with the above, it makes sense that the comparison
1244 * instruction should populate the flag register. It might be simpler
1245 * just to use the flag reg for most WM tasks?
1246 */
1247 void brw_CMP(struct brw_compile *p,
1248 struct brw_reg dest,
1249 GLuint conditional,
1250 struct brw_reg src0,
1251 struct brw_reg src1)
1252 {
1253 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1254
1255 insn->header.destreg__conditionalmod = conditional;
1256 brw_set_dest(p, insn, dest);
1257 brw_set_src0(insn, src0);
1258 brw_set_src1(insn, src1);
1259
1260 /* guess_execution_size(insn, src0); */
1261
1262
1263 /* Make it so that future instructions will use the computed flag
1264 * value until brw_set_predicate_control_flag_value() is called
1265 * again.
1266 */
1267 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1268 dest.nr == 0) {
1269 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1270 p->flag_value = 0xff;
1271 }
1272 }
1273
1274 /* Issue 'wait' instruction for n1, host could program MMIO
1275 to wake up thread. */
1276 void brw_WAIT (struct brw_compile *p)
1277 {
1278 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1279 struct brw_reg src = brw_notification_1_reg();
1280
1281 brw_set_dest(p, insn, src);
1282 brw_set_src0(insn, src);
1283 brw_set_src1(insn, brw_null_reg());
1284 insn->header.execution_size = 0; /* must */
1285 insn->header.predicate_control = 0;
1286 insn->header.compression_control = 0;
1287 }
1288
1289
1290 /***********************************************************************
1291 * Helpers for the various SEND message types:
1292 */
1293
1294 /** Extended math function, float[8].
1295 */
1296 void brw_math( struct brw_compile *p,
1297 struct brw_reg dest,
1298 GLuint function,
1299 GLuint saturate,
1300 GLuint msg_reg_nr,
1301 struct brw_reg src,
1302 GLuint data_type,
1303 GLuint precision )
1304 {
1305 struct intel_context *intel = &p->brw->intel;
1306
1307 if (intel->gen >= 6) {
1308 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1309
1310 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1311 assert(src.file == BRW_GENERAL_REGISTER_FILE);
1312
1313 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1314 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1315
1316 /* Source modifiers are ignored for extended math instructions. */
1317 assert(!src.negate);
1318 assert(!src.abs);
1319
1320 if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1321 function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1322 assert(src.type == BRW_REGISTER_TYPE_F);
1323 }
1324
1325 /* Math is the same ISA format as other opcodes, except that CondModifier
1326 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1327 */
1328 insn->header.destreg__conditionalmod = function;
1329 insn->header.saturate = saturate;
1330
1331 brw_set_dest(p, insn, dest);
1332 brw_set_src0(insn, src);
1333 brw_set_src1(insn, brw_null_reg());
1334 } else {
1335 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1336 GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1337 GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1338 /* Example code doesn't set predicate_control for send
1339 * instructions.
1340 */
1341 insn->header.predicate_control = 0;
1342 insn->header.destreg__conditionalmod = msg_reg_nr;
1343
1344 brw_set_dest(p, insn, dest);
1345 brw_set_src0(insn, src);
1346 brw_set_math_message(p->brw,
1347 insn,
1348 msg_length, response_length,
1349 function,
1350 BRW_MATH_INTEGER_UNSIGNED,
1351 precision,
1352 saturate,
1353 data_type);
1354 }
1355 }
1356
1357 /** Extended math function, float[8].
1358 */
1359 void brw_math2(struct brw_compile *p,
1360 struct brw_reg dest,
1361 GLuint function,
1362 struct brw_reg src0,
1363 struct brw_reg src1)
1364 {
1365 struct intel_context *intel = &p->brw->intel;
1366 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1367
1368 assert(intel->gen >= 6);
1369 (void) intel;
1370
1371
1372 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1373 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1374 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1375
1376 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1377 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1378 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1379
1380 if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1381 function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1382 assert(src0.type == BRW_REGISTER_TYPE_F);
1383 assert(src1.type == BRW_REGISTER_TYPE_F);
1384 }
1385
1386 /* Source modifiers are ignored for extended math instructions. */
1387 assert(!src0.negate);
1388 assert(!src0.abs);
1389 assert(!src1.negate);
1390 assert(!src1.abs);
1391
1392 /* Math is the same ISA format as other opcodes, except that CondModifier
1393 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1394 */
1395 insn->header.destreg__conditionalmod = function;
1396
1397 brw_set_dest(p, insn, dest);
1398 brw_set_src0(insn, src0);
1399 brw_set_src1(insn, src1);
1400 }
1401
1402 /**
1403 * Extended math function, float[16].
1404 * Use 2 send instructions.
1405 */
1406 void brw_math_16( struct brw_compile *p,
1407 struct brw_reg dest,
1408 GLuint function,
1409 GLuint saturate,
1410 GLuint msg_reg_nr,
1411 struct brw_reg src,
1412 GLuint precision )
1413 {
1414 struct intel_context *intel = &p->brw->intel;
1415 struct brw_instruction *insn;
1416 GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1417 GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1418
1419 if (intel->gen >= 6) {
1420 insn = next_insn(p, BRW_OPCODE_MATH);
1421
1422 /* Math is the same ISA format as other opcodes, except that CondModifier
1423 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1424 */
1425 insn->header.destreg__conditionalmod = function;
1426 insn->header.saturate = saturate;
1427
1428 /* Source modifiers are ignored for extended math instructions. */
1429 assert(!src.negate);
1430 assert(!src.abs);
1431
1432 brw_set_dest(p, insn, dest);
1433 brw_set_src0(insn, src);
1434 brw_set_src1(insn, brw_null_reg());
1435 return;
1436 }
1437
1438 /* First instruction:
1439 */
1440 brw_push_insn_state(p);
1441 brw_set_predicate_control_flag_value(p, 0xff);
1442 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1443
1444 insn = next_insn(p, BRW_OPCODE_SEND);
1445 insn->header.destreg__conditionalmod = msg_reg_nr;
1446
1447 brw_set_dest(p, insn, dest);
1448 brw_set_src0(insn, src);
1449 brw_set_math_message(p->brw,
1450 insn,
1451 msg_length, response_length,
1452 function,
1453 BRW_MATH_INTEGER_UNSIGNED,
1454 precision,
1455 saturate,
1456 BRW_MATH_DATA_VECTOR);
1457
1458 /* Second instruction:
1459 */
1460 insn = next_insn(p, BRW_OPCODE_SEND);
1461 insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1462 insn->header.destreg__conditionalmod = msg_reg_nr+1;
1463
1464 brw_set_dest(p, insn, offset(dest,1));
1465 brw_set_src0(insn, src);
1466 brw_set_math_message(p->brw,
1467 insn,
1468 msg_length, response_length,
1469 function,
1470 BRW_MATH_INTEGER_UNSIGNED,
1471 precision,
1472 saturate,
1473 BRW_MATH_DATA_VECTOR);
1474
1475 brw_pop_insn_state(p);
1476 }
1477
1478
1479 /**
1480 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1481 * using a constant offset per channel.
1482 *
1483 * The offset must be aligned to oword size (16 bytes). Used for
1484 * register spilling.
1485 */
1486 void brw_oword_block_write_scratch(struct brw_compile *p,
1487 struct brw_reg mrf,
1488 int num_regs,
1489 GLuint offset)
1490 {
1491 struct intel_context *intel = &p->brw->intel;
1492 uint32_t msg_control, msg_type;
1493 int mlen;
1494
1495 if (intel->gen >= 6)
1496 offset /= 16;
1497
1498 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1499
1500 if (num_regs == 1) {
1501 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1502 mlen = 2;
1503 } else {
1504 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1505 mlen = 3;
1506 }
1507
1508 /* Set up the message header. This is g0, with g0.2 filled with
1509 * the offset. We don't want to leave our offset around in g0 or
1510 * it'll screw up texture samples, so set it up inside the message
1511 * reg.
1512 */
1513 {
1514 brw_push_insn_state(p);
1515 brw_set_mask_control(p, BRW_MASK_DISABLE);
1516 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1517
1518 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1519
1520 /* set message header global offset field (reg 0, element 2) */
1521 brw_MOV(p,
1522 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1523 mrf.nr,
1524 2), BRW_REGISTER_TYPE_UD),
1525 brw_imm_ud(offset));
1526
1527 brw_pop_insn_state(p);
1528 }
1529
1530 {
1531 struct brw_reg dest;
1532 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1533 int send_commit_msg;
1534 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1535 BRW_REGISTER_TYPE_UW);
1536
1537 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1538 insn->header.compression_control = BRW_COMPRESSION_NONE;
1539 src_header = vec16(src_header);
1540 }
1541 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1542 insn->header.destreg__conditionalmod = mrf.nr;
1543
1544 /* Until gen6, writes followed by reads from the same location
1545 * are not guaranteed to be ordered unless write_commit is set.
1546 * If set, then a no-op write is issued to the destination
1547 * register to set a dependency, and a read from the destination
1548 * can be used to ensure the ordering.
1549 *
1550 * For gen6, only writes between different threads need ordering
1551 * protection. Our use of DP writes is all about register
1552 * spilling within a thread.
1553 */
1554 if (intel->gen >= 6) {
1555 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1556 send_commit_msg = 0;
1557 } else {
1558 dest = src_header;
1559 send_commit_msg = 1;
1560 }
1561
1562 brw_set_dest(p, insn, dest);
1563 if (intel->gen >= 6) {
1564 brw_set_src0(insn, mrf);
1565 } else {
1566 brw_set_src0(insn, brw_null_reg());
1567 }
1568
1569 if (intel->gen >= 6)
1570 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1571 else
1572 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1573
1574 brw_set_dp_write_message(p->brw,
1575 insn,
1576 255, /* binding table index (255=stateless) */
1577 msg_control,
1578 msg_type,
1579 mlen,
1580 GL_TRUE, /* header_present */
1581 0, /* pixel scoreboard */
1582 send_commit_msg, /* response_length */
1583 0, /* eot */
1584 send_commit_msg);
1585 }
1586 }
1587
1588
1589 /**
1590 * Read a block of owords (half a GRF each) from the scratch buffer
1591 * using a constant index per channel.
1592 *
1593 * Offset must be aligned to oword size (16 bytes). Used for register
1594 * spilling.
1595 */
1596 void
1597 brw_oword_block_read_scratch(struct brw_compile *p,
1598 struct brw_reg dest,
1599 struct brw_reg mrf,
1600 int num_regs,
1601 GLuint offset)
1602 {
1603 struct intel_context *intel = &p->brw->intel;
1604 uint32_t msg_control;
1605 int rlen;
1606
1607 if (intel->gen >= 6)
1608 offset /= 16;
1609
1610 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1611 dest = retype(dest, BRW_REGISTER_TYPE_UW);
1612
1613 if (num_regs == 1) {
1614 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1615 rlen = 1;
1616 } else {
1617 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1618 rlen = 2;
1619 }
1620
1621 {
1622 brw_push_insn_state(p);
1623 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1624 brw_set_mask_control(p, BRW_MASK_DISABLE);
1625
1626 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1627
1628 /* set message header global offset field (reg 0, element 2) */
1629 brw_MOV(p,
1630 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1631 mrf.nr,
1632 2), BRW_REGISTER_TYPE_UD),
1633 brw_imm_ud(offset));
1634
1635 brw_pop_insn_state(p);
1636 }
1637
1638 {
1639 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1640
1641 assert(insn->header.predicate_control == 0);
1642 insn->header.compression_control = BRW_COMPRESSION_NONE;
1643 insn->header.destreg__conditionalmod = mrf.nr;
1644
1645 brw_set_dest(p, insn, dest); /* UW? */
1646 if (intel->gen >= 6) {
1647 brw_set_src0(insn, mrf);
1648 } else {
1649 brw_set_src0(insn, brw_null_reg());
1650 }
1651
1652 brw_set_dp_read_message(p->brw,
1653 insn,
1654 255, /* binding table index (255=stateless) */
1655 msg_control,
1656 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1657 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1658 1, /* msg_length */
1659 rlen);
1660 }
1661 }
1662
1663 /**
1664 * Read a float[4] vector from the data port Data Cache (const buffer).
1665 * Location (in buffer) should be a multiple of 16.
1666 * Used for fetching shader constants.
1667 */
1668 void brw_oword_block_read(struct brw_compile *p,
1669 struct brw_reg dest,
1670 struct brw_reg mrf,
1671 uint32_t offset,
1672 uint32_t bind_table_index)
1673 {
1674 struct intel_context *intel = &p->brw->intel;
1675
1676 /* On newer hardware, offset is in units of owords. */
1677 if (intel->gen >= 6)
1678 offset /= 16;
1679
1680 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1681
1682 brw_push_insn_state(p);
1683 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1684 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1685 brw_set_mask_control(p, BRW_MASK_DISABLE);
1686
1687 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1688
1689 /* set message header global offset field (reg 0, element 2) */
1690 brw_MOV(p,
1691 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1692 mrf.nr,
1693 2), BRW_REGISTER_TYPE_UD),
1694 brw_imm_ud(offset));
1695
1696 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1697 insn->header.destreg__conditionalmod = mrf.nr;
1698
1699 /* cast dest to a uword[8] vector */
1700 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1701
1702 brw_set_dest(p, insn, dest);
1703 if (intel->gen >= 6) {
1704 brw_set_src0(insn, mrf);
1705 } else {
1706 brw_set_src0(insn, brw_null_reg());
1707 }
1708
1709 brw_set_dp_read_message(p->brw,
1710 insn,
1711 bind_table_index,
1712 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1713 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1714 0, /* source cache = data cache */
1715 1, /* msg_length */
1716 1); /* response_length (1 reg, 2 owords!) */
1717
1718 brw_pop_insn_state(p);
1719 }
1720
1721 /**
1722 * Read a set of dwords from the data port Data Cache (const buffer).
1723 *
1724 * Location (in buffer) appears as UD offsets in the register after
1725 * the provided mrf header reg.
1726 */
1727 void brw_dword_scattered_read(struct brw_compile *p,
1728 struct brw_reg dest,
1729 struct brw_reg mrf,
1730 uint32_t bind_table_index)
1731 {
1732 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1733
1734 brw_push_insn_state(p);
1735 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1736 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1737 brw_set_mask_control(p, BRW_MASK_DISABLE);
1738 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1739 brw_pop_insn_state(p);
1740
1741 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1742 insn->header.destreg__conditionalmod = mrf.nr;
1743
1744 /* cast dest to a uword[8] vector */
1745 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1746
1747 brw_set_dest(p, insn, dest);
1748 brw_set_src0(insn, brw_null_reg());
1749
1750 brw_set_dp_read_message(p->brw,
1751 insn,
1752 bind_table_index,
1753 BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1754 BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1756 2, /* msg_length */
1757 1); /* response_length */
1758 }
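
/* Illustrative usage sketch (disabled): the eight per-channel dword
 * offsets must be placed in the MRF following the header before the
 * call, which is why msg_length is 2.  'offsets' and 'dword_bti' are
 * hypothetical names for this example.
 */
#if 0
   brw_MOV(p, retype(brw_message_reg(3), BRW_REGISTER_TYPE_UD),
           retype(offsets, BRW_REGISTER_TYPE_UD));
   brw_dword_scattered_read(p, brw_vec8_grf(6, 0),
                            brw_message_reg(2), dword_bti);
#endif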
1759
1760
1761
1762 /**
1763 * Read float[4] constant(s) from VS constant buffer.
1764 * For relative addressing, two float[4] constants will be read into 'dest'.
1765 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1766 */
1767 void brw_dp_READ_4_vs(struct brw_compile *p,
1768 struct brw_reg dest,
1769 GLuint location,
1770 GLuint bind_table_index)
1771 {
1772 struct intel_context *intel = &p->brw->intel;
1773 struct brw_instruction *insn;
1774 GLuint msg_reg_nr = 1;
1775
1776 if (intel->gen >= 6)
1777 location /= 16;
1778
   /* Set up MRF[1] with the location/offset into the const buffer */
1780 brw_push_insn_state(p);
1781 brw_set_access_mode(p, BRW_ALIGN_1);
1782 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1783 brw_set_mask_control(p, BRW_MASK_DISABLE);
1784 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1785 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1786 BRW_REGISTER_TYPE_UD),
1787 brw_imm_ud(location));
1788 brw_pop_insn_state(p);
1789
1790 insn = next_insn(p, BRW_OPCODE_SEND);
1791
1792 insn->header.predicate_control = BRW_PREDICATE_NONE;
1793 insn->header.compression_control = BRW_COMPRESSION_NONE;
1794 insn->header.destreg__conditionalmod = msg_reg_nr;
1795 insn->header.mask_control = BRW_MASK_DISABLE;
1796
1797 brw_set_dest(p, insn, dest);
1798 if (intel->gen >= 6) {
1799 brw_set_src0(insn, brw_message_reg(msg_reg_nr));
1800 } else {
1801 brw_set_src0(insn, brw_null_reg());
1802 }
1803
1804 brw_set_dp_read_message(p->brw,
1805 insn,
1806 bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW, /* msg_control */
1808 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1810 1, /* msg_length */
1811 1); /* response_length (1 Oword) */
1812 }
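
/* Illustrative usage sketch (disabled): fetch the float[4] constant at
 * byte offset 'location' (a multiple of 16) into the lower half of g3.
 * 'vs_const_bti' is a hypothetical binding-table slot.
 */
#if 0
   brw_dp_READ_4_vs(p, brw_vec8_grf(3, 0), location, vs_const_bti);
#endif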
1813
1814 /**
1815 * Read a float[4] constant per vertex from VS constant buffer, with
1816 * relative addressing.
1817 */
1818 void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1819 struct brw_reg dest,
1820 struct brw_reg addr_reg,
1821 GLuint offset,
1822 GLuint bind_table_index)
1823 {
1824 struct intel_context *intel = &p->brw->intel;
1825 struct brw_reg src = brw_vec8_grf(0, 0);
1826 int msg_type;
1827
   /* Set up MRF[1] with the offset into the const buffer */
1829 brw_push_insn_state(p);
1830 brw_set_access_mode(p, BRW_ALIGN_1);
1831 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1832 brw_set_mask_control(p, BRW_MASK_DISABLE);
1833 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1834
1835 /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1836 * fields ignored.
1837 */
1838 brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
1839 addr_reg, brw_imm_d(offset));
1840 brw_pop_insn_state(p);
1841
1842 gen6_resolve_implied_move(p, &src, 0);
1843 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1844
1845 insn->header.predicate_control = BRW_PREDICATE_NONE;
1846 insn->header.compression_control = BRW_COMPRESSION_NONE;
1847 insn->header.destreg__conditionalmod = 0;
1848 insn->header.mask_control = BRW_MASK_DISABLE;
1849
1850 brw_set_dest(p, insn, dest);
1851 brw_set_src0(insn, src);
1852
1853 if (intel->gen == 6)
1854 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1855 else if (intel->gen == 5 || intel->is_g4x)
1856 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1857 else
1858 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1859
1860 brw_set_dp_read_message(p->brw,
1861 insn,
1862 bind_table_index,
1863 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1864 msg_type,
1865 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1866 2, /* msg_length */
1867 1); /* response_length */
1868 }
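
/* Illustrative usage sketch: with the per-vertex index already in
 * addr_reg, a call such as
 *
 *    brw_dp_READ_4_vs_relative(p, dest, addr_reg, 16, bti);
 *
 * adds 16 to the index, issues the dual-block read, and leaves two
 * float[4] constants in the two halves of 'dest'.  'bti' is a
 * hypothetical binding-table slot for the VS constant buffer.
 */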
1869
1870
1871
1872 void brw_fb_WRITE(struct brw_compile *p,
1873 int dispatch_width,
1874 GLuint msg_reg_nr,
1875 struct brw_reg src0,
1876 GLuint binding_table_index,
1877 GLuint msg_length,
1878 GLuint response_length,
1879 GLboolean eot,
1880 GLboolean header_present)
1881 {
1882 struct intel_context *intel = &p->brw->intel;
1883 struct brw_instruction *insn;
1884 GLuint msg_control, msg_type;
1885 struct brw_reg dest;
1886
1887 if (dispatch_width == 16)
1888 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1889 else
1890 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1891
1892 if (intel->gen >= 6 && binding_table_index == 0) {
1893 insn = next_insn(p, BRW_OPCODE_SENDC);
1894 } else {
1895 insn = next_insn(p, BRW_OPCODE_SEND);
1896 }
1897 /* The execution mask is ignored for render target writes. */
1898 insn->header.predicate_control = 0;
1899 insn->header.compression_control = BRW_COMPRESSION_NONE;
1900
1901 if (intel->gen >= 6) {
1902 /* headerless version, just submit color payload */
1903 src0 = brw_message_reg(msg_reg_nr);
1904
1905 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1906 } else {
1907 insn->header.destreg__conditionalmod = msg_reg_nr;
1908
1909 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1910 }
1911
1912 if (dispatch_width == 16)
1913 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1914 else
1915 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1916
1917 brw_set_dest(p, insn, dest);
1918 brw_set_src0(insn, src0);
1919 brw_set_dp_write_message(p->brw,
1920 insn,
1921 binding_table_index,
1922 msg_control,
1923 msg_type,
1924 msg_length,
1925 header_present,
1926 1, /* pixel scoreboard */
1927 response_length,
1928 eot,
1929 0 /* send_commit_msg */);
1930 }
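
/* Illustrative usage sketch (disabled): a pre-Gen6 SIMD8 render-target
 * write of four color registers following a two-register header at m0,
 * terminating the thread.  The counts are examples only.
 */
#if 0
   brw_fb_WRITE(p, 8 /* dispatch_width */, 0 /* msg_reg_nr */,
                brw_message_reg(0), 0 /* binding_table_index */,
                6 /* msg_length: 2 header + 4 color */,
                0 /* response_length */,
                GL_TRUE /* eot */, GL_TRUE /* header_present */);
#endif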
1931
1932
1933 /**
1934 * Texture sample instruction.
1935 * Note: the msg_type plus msg_length values determine exactly what kind
1936 * of sampling operation is performed. See volume 4, page 161 of docs.
1937 */
1938 void brw_SAMPLE(struct brw_compile *p,
1939 struct brw_reg dest,
1940 GLuint msg_reg_nr,
1941 struct brw_reg src0,
1942 GLuint binding_table_index,
1943 GLuint sampler,
1944 GLuint writemask,
1945 GLuint msg_type,
1946 GLuint response_length,
1947 GLuint msg_length,
1948 GLboolean eot,
1949 GLuint header_present,
1950 GLuint simd_mode)
1951 {
1952 struct intel_context *intel = &p->brw->intel;
1953 GLboolean need_stall = 0;
1954
1955 if (writemask == 0) {
1956 /*printf("%s: zero writemask??\n", __FUNCTION__); */
1957 return;
1958 }
1959
1960 /* Hardware doesn't do destination dependency checking on send
1961 * instructions properly. Add a workaround which generates the
1962 * dependency by other means. In practice it seems like this bug
1963 * only crops up for texture samples, and only where registers are
1964 * written by the send and then written again later without being
1965 * read in between. Luckily for us, we already track that
1966 * information and use it to modify the writemask for the
1967 * instruction, so that is a guide for whether a workaround is
1968 * needed.
1969 */
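   /* Worked example (illustrative): writemask YZ is contiguous, so no
    * stall is needed: the first loop skips X (dst_offset == 2 response
    * registers, matching 16-wide layout), newmask == writemask, and the
    * channel-disable field in element 2 of the header (bits 15:12) is
    * set to ~YZ = XW.  writemask XZ has a hole, so newmask (just X) !=
    * writemask, and we fall back to the stalling workaround below.
    */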
1970 if (writemask != WRITEMASK_XYZW) {
1971 GLuint dst_offset = 0;
1972 GLuint i, newmask = 0, len = 0;
1973
1974 for (i = 0; i < 4; i++) {
1975 if (writemask & (1<<i))
1976 break;
1977 dst_offset += 2;
1978 }
1979 for (; i < 4; i++) {
1980 if (!(writemask & (1<<i)))
1981 break;
1982 newmask |= 1<<i;
1983 len++;
1984 }
1985
1986 if (newmask != writemask) {
1987 need_stall = 1;
1988 /* printf("need stall %x %x\n", newmask , writemask); */
1989 }
1990 else {
1991 GLboolean dispatch_16 = GL_FALSE;
1992
1993 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
1994
1995 guess_execution_size(p, p->current, dest);
1996 if (p->current->header.execution_size == BRW_EXECUTE_16)
1997 dispatch_16 = GL_TRUE;
1998
1999 newmask = ~newmask & WRITEMASK_XYZW;
2000
2001 brw_push_insn_state(p);
2002
2003 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2004 brw_set_mask_control(p, BRW_MASK_DISABLE);
2005
2006 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2007 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2008 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2009
2010 brw_pop_insn_state(p);
2011
2012 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2013 dest = offset(dest, dst_offset);
2014
2015 /* For 16-wide dispatch, masked channels are skipped in the
2016 * response. For 8-wide, masked channels still take up slots,
2017 * and are just not written to.
2018 */
2019 if (dispatch_16)
2020 response_length = len * 2;
2021 }
2022 }
2023
2024 {
2025 struct brw_instruction *insn;
2026
2027 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2028
2029 insn = next_insn(p, BRW_OPCODE_SEND);
2030 insn->header.predicate_control = 0; /* XXX */
2031 insn->header.compression_control = BRW_COMPRESSION_NONE;
2032 if (intel->gen < 6)
2033 insn->header.destreg__conditionalmod = msg_reg_nr;
2034
2035 brw_set_dest(p, insn, dest);
2036 brw_set_src0(insn, src0);
2037 brw_set_sampler_message(p->brw, insn,
2038 binding_table_index,
2039 sampler,
2040 msg_type,
2041 response_length,
2042 msg_length,
2043 eot,
2044 header_present,
2045 simd_mode);
2046 }
2047
2048 if (need_stall) {
2049 struct brw_reg reg = vec8(offset(dest, response_length-1));
2050
2051 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2052 */
2053 brw_push_insn_state(p);
2054 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2055 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2056 retype(reg, BRW_REGISTER_TYPE_UD));
2057 brw_pop_insn_state(p);
2058 }
2059
2060 }
2061
2062 /* All these variables are pretty confusing - we might be better off
2063 * using bitmasks and macros for this, in the old style. Or perhaps
2064 * just having the caller instantiate the fields in dword3 itself.
2065 */
2066 void brw_urb_WRITE(struct brw_compile *p,
2067 struct brw_reg dest,
2068 GLuint msg_reg_nr,
2069 struct brw_reg src0,
2070 GLboolean allocate,
2071 GLboolean used,
2072 GLuint msg_length,
2073 GLuint response_length,
2074 GLboolean eot,
2075 GLboolean writes_complete,
2076 GLuint offset,
2077 GLuint swizzle)
2078 {
2079 struct intel_context *intel = &p->brw->intel;
2080 struct brw_instruction *insn;
2081
2082 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2083
2084 insn = next_insn(p, BRW_OPCODE_SEND);
2085
2086 assert(msg_length < BRW_MAX_MRF);
2087
2088 brw_set_dest(p, insn, dest);
2089 brw_set_src0(insn, src0);
2090 brw_set_src1(insn, brw_imm_d(0));
2091
2092 if (intel->gen < 6)
2093 insn->header.destreg__conditionalmod = msg_reg_nr;
2094
2095 brw_set_urb_message(p->brw,
2096 insn,
2097 allocate,
2098 used,
2099 msg_length,
2100 response_length,
2101 eot,
2102 writes_complete,
2103 offset,
2104 swizzle);
2105 }
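
/* Illustrative usage sketch (disabled): a thread-terminating URB write
 * of four payload registers starting at m0, with no response and the
 * interleaved swizzle.  The parameter values are examples only.
 */
#if 0
   brw_urb_WRITE(p, brw_null_reg(), 0 /* msg_reg_nr */, brw_vec8_grf(0, 0),
                 0 /* allocate */, 1 /* used */,
                 4 /* msg_length */, 0 /* response_length */,
                 1 /* eot */, 1 /* writes_complete */,
                 0 /* offset */, BRW_URB_SWIZZLE_INTERLEAVE);
#endif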
2106
2107 static int
2108 brw_find_next_block_end(struct brw_compile *p, int start)
2109 {
2110 int ip;
2111
2112 for (ip = start + 1; ip < p->nr_insn; ip++) {
2113 struct brw_instruction *insn = &p->store[ip];
2114
2115 switch (insn->header.opcode) {
2116 case BRW_OPCODE_ENDIF:
2117 case BRW_OPCODE_ELSE:
2118 case BRW_OPCODE_WHILE:
2119 return ip;
2120 }
2121 }
2122 assert(!"not reached");
2123 return start + 1;
2124 }
2125
2126 /* There is no DO instruction on gen6, so to find the end of the loop
2127 * we have to see if the loop is jumping back before our start
2128 * instruction.
2129 */
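/* Example: with br == 2 (jump counts are in units of 64-bit
 * half-instructions), a WHILE at ip == 20 with jump_count == -24 jumps
 * back to instruction 20 + (-24 / 2) = 8.  If our start ip is 10, the
 * target (8) precedes it, so that WHILE is the one that ends our loop.
 */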
2130 static int
2131 brw_find_loop_end(struct brw_compile *p, int start)
2132 {
2133 int ip;
2134 int br = 2;
2135
2136 for (ip = start + 1; ip < p->nr_insn; ip++) {
2137 struct brw_instruction *insn = &p->store[ip];
2138
2139 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2140 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
2141 return ip;
2142 }
2143 }
2144 assert(!"not reached");
2145 return start + 1;
2146 }
2147
2148 /* After program generation, go back and update the UIP and JIP of
 * BREAK and CONTINUE instructions to their correct locations.
2150 */
2151 void
2152 brw_set_uip_jip(struct brw_compile *p)
2153 {
2154 struct intel_context *intel = &p->brw->intel;
2155 int ip;
2156 int br = 2;
2157
2158 if (intel->gen < 6)
2159 return;
2160
2161 for (ip = 0; ip < p->nr_insn; ip++) {
2162 struct brw_instruction *insn = &p->store[ip];
2163
2164 switch (insn->header.opcode) {
2165 case BRW_OPCODE_BREAK:
2166 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2167 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
2168 break;
2169 case BRW_OPCODE_CONTINUE:
2170 /* JIP is set at CONTINUE emit time, since that's when we
2171 * know where the start of the loop is.
2172 */
2173 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2174 assert(insn->bits3.break_cont.uip != 0);
2175 assert(insn->bits3.break_cont.jip != 0);
2176 break;
2177 }
2178 }
2179 }
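
/* Worked example (illustrative): a BREAK at ip 10, with the enclosing
 * block ending at an ENDIF at ip 12 and the loop's WHILE at ip 20,
 * gets jip = 2 * (12 - 10) = 4 and uip = 2 * (20 - 10 + 1) = 22, so
 * UIP points just past the WHILE (units are 64-bit half-instructions).
 */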
2180
2181 void brw_ff_sync(struct brw_compile *p,
2182 struct brw_reg dest,
2183 GLuint msg_reg_nr,
2184 struct brw_reg src0,
2185 GLboolean allocate,
2186 GLuint response_length,
2187 GLboolean eot)
2188 {
2189 struct intel_context *intel = &p->brw->intel;
2190 struct brw_instruction *insn;
2191
2192 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2193
2194 insn = next_insn(p, BRW_OPCODE_SEND);
2195 brw_set_dest(p, insn, dest);
2196 brw_set_src0(insn, src0);
2197 brw_set_src1(insn, brw_imm_d(0));
2198
2199 if (intel->gen < 6)
2200 insn->header.destreg__conditionalmod = msg_reg_nr;
2201
2202 brw_set_ff_sync_message(p->brw,
2203 insn,
2204 allocate,
2205 response_length,
2206 eot);
2207 }
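
/* Illustrative usage sketch (disabled): clip/GS-style code uses ff_sync
 * to allocate a URB handle into r0 before writing vertices, e.g.:
 */
#if 0
   brw_ff_sync(p, brw_vec8_grf(0, 0), 0 /* msg_reg_nr */,
               brw_vec8_grf(0, 0), 1 /* allocate */,
               1 /* response_length */, 0 /* eot */);
#endif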