i965: Add support for AVG instruction.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */


#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "glsl/ralloc.h"

/***********************************************************************
 * Internal helper for constructing instructions
 */

static void guess_execution_size(struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 struct brw_reg reg)
{
   if (reg.width == BRW_WIDTH_8 && p->compressed)
      insn->header.execution_size = BRW_EXECUTE_16;
   else
      insn->header.execution_size = reg.width;   /* note - definitions are compatible */
}
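
/* For example, a width-8 register while the compiler is emitting
 * compressed (SIMD16) code yields BRW_EXECUTE_16; otherwise the register
 * width is used directly, relying on the BRW_WIDTH_* and BRW_EXECUTE_*
 * encodings lining up, as the comment above notes.
 */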

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
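
/* A minimal usage sketch (hypothetical values; assumes the caller has a
 * brw_compile context and a payload GRF to send from):
 *
 *    struct brw_reg src = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &src, 1);
 *    // src now refers to m1 (or, on gen7, the GRF standing in for m1),
 *    // ready to be used as the source of a following SEND.
 */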

static void
gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
{
   /* From the BSpec / ISA Reference / send - [DevIVB+]:
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
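
/* Concretely, assuming GEN7_MRF_HACK_START is 112 (so the fake MRFs land
 * in the R112-R127 range quoted above), a write to m4 on gen7 actually
 * targets r116.  The specific value is an illustration; see the define
 * for the authoritative offset.
 */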


void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         /* even though it's ignored in align16, this still has to be '01' */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* even though it's ignored in align16, this still has to be '01' */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}

extern int reg_type_size[];

static void
validate_reg(struct brw_instruction *insn, struct brw_reg reg)
{
   int hstride_for_reg[] = {0, 1, 2, 4};
   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
   int width_for_reg[] = {1, 2, 4, 8, 16};
   int execsize_for_reg[] = {1, 2, 4, 8, 16};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
                reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
   width = width_for_reg[reg.width];

   assert(insn->header.execution_size >= 0 &&
          insn->header.execution_size < Elements(execsize_for_reg));
   execsize = execsize_for_reg[insn->header.execution_size];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}
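
/* Worked example of restriction 4 above: the common row-major region
 * <8;8,1> executed at size 8 has execsize == width == 8 and hstride == 1,
 * so vstride must equal width * hstride == 8 -- which it does.  A region
 * like <4;8,1> at the same execution size would trip the assertion.
 */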

void
brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg reg)
{
   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src0_reg_file = reg.file;
   insn->bits1.da1.src0_reg_type = reg.type;
   insn->bits2.da1.src0_abs = reg.abs;
   insn->bits2.da1.src0_negate = reg.negate;
   insn->bits2.da1.src0_address_mode = reg.address_mode;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;

      /* Required to set some fields in src1 as well:
       */
      insn->bits1.da1.src1_reg_file = 0; /* arf */
      insn->bits1.da1.src1_reg_type = reg.type;
   }
   else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.da1.src0_subreg_nr = reg.subnr;
            insn->bits2.da1.src0_reg_nr = reg.nr;
         }
         else {
            insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
            insn->bits2.da16.src0_reg_nr = reg.nr;
         }
      }
      else {
         insn->bits2.ia1.src0_subreg_nr = reg.subnr;

         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
         }
         else {
            insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
         }
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits2.da1.src0_width = BRW_WIDTH_1;
            insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits2.da1.src0_horiz_stride = reg.hstride;
            insn->bits2.da1.src0_width = reg.width;
            insn->bits2.da1.src0_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits2.da16.src0_vert_stride = reg.vstride;
      }
   }
}
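
/* Note the scalar special case above: a source such as brw_vec1_grf(2, 3)
 * used in an execution-size-1 instruction is encoded with the region
 * <0;1,0>, replicating the single value rather than striding through the
 * register.
 */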


void brw_set_src1(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits3.da1.src1_subreg_nr = reg.subnr;
         insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
         insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
            insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits3.da1.src1_horiz_stride = reg.hstride;
            insn->bits3.da1.src1_width = reg.width;
            insn->bits3.da1.src1_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
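
/* For example, brw_set_math_message() below calls this with
 * msg_length == 2 and response_length == 2 for
 * BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: two registers of
 * operands go out, and the quotient and remainder come back in two
 * registers.
 */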

static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  GLuint function,
                                  GLuint integer_type,
                                  bool low_precision,
                                  bool saturate,
                                  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = saturate;
      insn->bits3.math.data_type = dataType;
   }
}


static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    bool allocate,
                                    GLuint response_length,
                                    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}

static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 bool allocate,
                                 bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
                                 bool end_of_thread,
                                 bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used; /* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0; /* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used; /* ? */
      insn->bits3.urb.complete = complete;
   }
}

void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         GLuint binding_table_index,
                         GLuint msg_control,
                         GLuint msg_type,
                         GLuint msg_length,
                         bool header_present,
                         GLuint last_render_target,
                         GLuint response_length,
                         GLuint end_of_thread,
                         GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}

void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint msg_control,
                        GLuint msg_type,
                        GLuint target_cache,
                        GLuint msg_length,
                        GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              true, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}

void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
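
/* A sketch of typical usage (the values are illustrative, not taken from
 * a real caller -- message type, lengths, and SIMD mode must match the
 * surface and the generation being targeted):
 *
 *    struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, insn, dest);
 *    brw_set_src0(p, insn, src);
 *    brw_set_sampler_message(p, insn,
 *                            surf_index,    // binding table index
 *                            sampler,       // sampler state index
 *                            msg_type,      // gen-specific sample opcode
 *                            4,             // response: one reg per channel (SIMD8)
 *                            1,             // message length
 *                            false,         // no header
 *                            simd_mode,
 *                            return_format);
 */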


#define next_insn brw_next_insn
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   if (p->nr_insn + 1 > p->store_size) {
      if (0)
         printf("increasing the store size to %d\n", p->store_size << 1);
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
      if (!p->store)
         assert(!"realloc eu store memory failed");
   }

   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}

static struct brw_instruction *brw_alu1( struct brw_compile *p,
                                         GLuint opcode,
                                         struct brw_reg dest,
                                         struct brw_reg src )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static struct brw_instruction *brw_alu2(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1 )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
   } else {
      return reg.subnr / 4;
   }
}
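
/* Example: a replicated scalar (vstride 0) source with subnr 0 and the
 * single-value swizzle .zzzz gives 0 / 4 + 2 == 2, folding the swizzle's
 * channel select into the 3-src subregister number.
 */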

static struct brw_instruction *brw_alu3(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
{
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   assert(src0.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   assert(src1.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   assert(src2.type == BRW_REGISTER_TYPE_F);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   return insn;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
struct brw_instruction *brw_##OP(struct brw_compile *p,     \
                                 struct brw_reg dest,       \
                                 struct brw_reg src0)       \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                            \
struct brw_instruction *brw_##OP(struct brw_compile *p,     \
                                 struct brw_reg dest,       \
                                 struct brw_reg src0,       \
                                 struct brw_reg src1)       \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

#define ALU3(OP)                                            \
struct brw_instruction *brw_##OP(struct brw_compile *p,     \
                                 struct brw_reg dest,       \
                                 struct brw_reg src0,       \
                                 struct brw_reg src1,       \
                                 struct brw_reg src2)       \
{                                                           \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                              \
void brw_##OP(struct brw_compile *p,                           \
              struct brw_reg dest,                             \
              struct brw_reg src)                              \
{                                                              \
   struct brw_instruction *rnd, *add;                          \
   rnd = next_insn(p, BRW_OPCODE_##OP);                        \
   brw_set_dest(p, rnd, dest);                                 \
   brw_set_src0(p, rnd, src);                                  \
                                                               \
   if (p->brw->intel.gen < 6) {                                \
      /* turn on round-increments */                           \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));           \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;    \
   }                                                           \
}
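
/* So on gen4/5, brw_RNDZ(p, dest, src) emits roughly:
 *
 *    rndz.r dest, src            (sets per-channel "increment" flag bits)
 *    (+f0) add dest, dest, 1.0   (predicated fixup)
 *
 * while on gen6+ only the first instruction is emitted.  The mnemonics
 * here are informal shorthand, not assembler syntax.
 */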


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)

ROUND(RNDZ)
ROUND(RNDE)


struct brw_instruction *brw_ADD(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

struct brw_instruction *brw_AVG(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      assert(!"Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}
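
/* brw_AVG emits the hardware's component-wise integer average.  Per the
 * PRM's description of avg (as best recalled here), the result rounds
 * up: roughly dst = (src0 + src1 + 1) >> 1 per channel -- consult the
 * PRM for the authoritative semantics.
 */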

struct brw_instruction *brw_MUL(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}


void brw_NOP(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}



/***********************************************************************
 * Comparisons, if/else/endif
 */

struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}

static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static struct brw_instruction *
pop_if_stack(struct brw_compile *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   if (p->loop_stack_array_size < p->loop_stack_depth) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
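
/* The "* 16" above converts an instruction count into a byte offset:
 * each native instruction is 128 bits (16 bytes), and an ADD to the IP
 * register operates on byte addresses.
 */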

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is in units of 64-bit data chunks, so one 128-bit
    * instruction requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         if_inst->header.opcode = BRW_OPCODE_IFF;
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br * (endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br * (endif_inst - else_inst);
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}

void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store (p->store), so call it first before deriving instruction store
    * pointers from stored indexes.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}

struct brw_instruction *gen6_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   return insn;
}

struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}

/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   int br = (intel->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
          inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
                 inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
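
/* Note the asymmetry in the patched offsets: BREAK is pointed just past
 * the WHILE (the "+ 1"), so it leaves the loop entirely, while CONTINUE
 * is pointed at the WHILE itself so the loop condition is re-evaluated.
 */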

struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         insn->header.execution_size = BRW_EXECUTE_1;
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(do_insn->header.opcode == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         insn->header.execution_size = do_insn->header.execution_size;
         insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
         insn->bits3.if_else.pop_count = 0;
         insn->bits3.if_else.pad0 = 0;

         brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}


/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   GLuint jmpi = 1;

   if (intel->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
}
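
/* A minimal usage sketch (hypothetical surrounding code): record the
 * index of the JMPI, emit the instructions to be skipped, then patch:
 *
 *    int jmp_idx = p->nr_insn;
 *    brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    ... emit instructions to jump over ...
 *    brw_land_fwd_jump(p, jmp_idx);
 */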


/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void brw_CMP(struct brw_compile *p,
             struct brw_reg dest,
             GLuint conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* guess_execution_size(insn, src0); */

   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}

/* Issue a 'wait' instruction on notification register n1; the host can
 * program MMIO to wake the thread up.
 */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1649
1650
1651 /***********************************************************************
1652 * Helpers for the various SEND message types:
1653 */
1654
1655 /** Extended math function, float[8].
1656 */
1657 void brw_math( struct brw_compile *p,
1658 struct brw_reg dest,
1659 GLuint function,
1660 GLuint saturate,
1661 GLuint msg_reg_nr,
1662 struct brw_reg src,
1663 GLuint data_type,
1664 GLuint precision )
1665 {
1666 struct intel_context *intel = &p->brw->intel;
1667
1668 if (intel->gen >= 6) {
1669 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1670
1671 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1672 assert(src.file == BRW_GENERAL_REGISTER_FILE);
1673
1674 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1675 if (intel->gen == 6)
1676 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1677
1678 /* Source modifiers are ignored for extended math instructions on Gen6. */
1679 if (intel->gen == 6) {
1680 assert(!src.negate);
1681 assert(!src.abs);
1682 }
1683
1684 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1685 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1686 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1687 assert(src.type != BRW_REGISTER_TYPE_F);
1688 } else {
1689 assert(src.type == BRW_REGISTER_TYPE_F);
1690 }
1691
1692 /* Math is the same ISA format as other opcodes, except that CondModifier
1693 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1694 */
1695 insn->header.destreg__conditionalmod = function;
1696 insn->header.saturate = saturate;
1697
1698 brw_set_dest(p, insn, dest);
1699 brw_set_src0(p, insn, src);
1700 brw_set_src1(p, insn, brw_null_reg());
1701 } else {
1702 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1703
1704 /* Example code doesn't set predicate_control for send
1705 * instructions.
1706 */
1707 insn->header.predicate_control = 0;
1708 insn->header.destreg__conditionalmod = msg_reg_nr;
1709
1710 brw_set_dest(p, insn, dest);
1711 brw_set_src0(p, insn, src);
1712 brw_set_math_message(p,
1713 insn,
1714 function,
1715 src.type == BRW_REGISTER_TYPE_D,
1716 precision,
1717 saturate,
1718 data_type);
1719 }
1720 }
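
/* Usage sketch (illustrative; 'dst' and 'src' are hypothetical float GRFs):
 * a SIMD8 reciprocal. On pre-gen6 the msg_reg_nr argument (here 2) selects
 * the MRF used by the implied SEND; on gen6+ it is unused:
 *
 *    brw_math(p, dst, BRW_MATH_FUNCTION_INV, BRW_MATH_SATURATE_NONE, 2,
 *             src, BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
 */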
1721
1722 /** Extended math function with two source operands, float[8]. Gen6+ only.
1723 */
1724 void brw_math2(struct brw_compile *p,
1725 struct brw_reg dest,
1726 GLuint function,
1727 struct brw_reg src0,
1728 struct brw_reg src1)
1729 {
1730 struct intel_context *intel = &p->brw->intel;
1731 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1732
1733 assert(intel->gen >= 6);
1735
1737 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1738 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1739 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1740
1741 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1742 if (intel->gen == 6) {
1743 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1744 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1745 }
1746
1747 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1748 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1749 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1750 assert(src0.type != BRW_REGISTER_TYPE_F);
1751 assert(src1.type != BRW_REGISTER_TYPE_F);
1752 } else {
1753 assert(src0.type == BRW_REGISTER_TYPE_F);
1754 assert(src1.type == BRW_REGISTER_TYPE_F);
1755 }
1756
1757 /* Source modifiers are ignored for extended math instructions on Gen6. */
1758 if (intel->gen == 6) {
1759 assert(!src0.negate);
1760 assert(!src0.abs);
1761 assert(!src1.negate);
1762 assert(!src1.abs);
1763 }
1764
1765 /* Math is the same ISA format as other opcodes, except that CondModifier
1766 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1767 */
1768 insn->header.destreg__conditionalmod = function;
1769
1770 brw_set_dest(p, insn, dest);
1771 brw_set_src0(p, insn, src0);
1772 brw_set_src1(p, insn, src1);
1773 }
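
/* Usage sketch (illustrative; 'dst', 'src0' and 'src1' are hypothetical
 * float GRFs): dst = pow(src0, src1) on gen6+:
 *
 *    brw_math2(p, dst, BRW_MATH_FUNCTION_POW, src0, src1);
 */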
1774
1775 /**
1776 * Extended math function, float[16].
1777 * Uses two SEND instructions on pre-gen6; a single MATH instruction on gen6+.
1778 */
1779 void brw_math_16( struct brw_compile *p,
1780 struct brw_reg dest,
1781 GLuint function,
1782 GLuint saturate,
1783 GLuint msg_reg_nr,
1784 struct brw_reg src,
1785 GLuint precision )
1786 {
1787 struct intel_context *intel = &p->brw->intel;
1788 struct brw_instruction *insn;
1789
1790 if (intel->gen >= 6) {
1791 insn = next_insn(p, BRW_OPCODE_MATH);
1792
1793 /* Math is the same ISA format as other opcodes, except that CondModifier
1794 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1795 */
1796 insn->header.destreg__conditionalmod = function;
1797 insn->header.saturate = saturate;
1798
1799 /* Source modifiers are ignored for extended math instructions. */
1800 assert(!src.negate);
1801 assert(!src.abs);
1802
1803 brw_set_dest(p, insn, dest);
1804 brw_set_src0(p, insn, src);
1805 brw_set_src1(p, insn, brw_null_reg());
1806 return;
1807 }
1808
1809 /* First instruction:
1810 */
1811 brw_push_insn_state(p);
1812 brw_set_predicate_control_flag_value(p, 0xff);
1813 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1814
1815 insn = next_insn(p, BRW_OPCODE_SEND);
1816 insn->header.destreg__conditionalmod = msg_reg_nr;
1817
1818 brw_set_dest(p, insn, dest);
1819 brw_set_src0(p, insn, src);
1820 brw_set_math_message(p,
1821 insn,
1822 function,
1823 BRW_MATH_INTEGER_UNSIGNED,
1824 precision,
1825 saturate,
1826 BRW_MATH_DATA_VECTOR);
1827
1828 /* Second instruction:
1829 */
1830 insn = next_insn(p, BRW_OPCODE_SEND);
1831 insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1832 insn->header.destreg__conditionalmod = msg_reg_nr+1;
1833
1834 brw_set_dest(p, insn, offset(dest,1));
1835 brw_set_src0(p, insn, src);
1836 brw_set_math_message(p,
1837 insn,
1838 function,
1839 BRW_MATH_INTEGER_UNSIGNED,
1840 precision,
1841 saturate,
1842 BRW_MATH_DATA_VECTOR);
1843
1844 brw_pop_insn_state(p);
1845 }
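
/* Usage sketch (illustrative; 'dst' and 'src' are hypothetical float GRFs):
 * a SIMD16 EXP. On pre-gen6 this expands to two SIMD8 SENDs using MRFs 2
 * and 3:
 *
 *    brw_math_16(p, dst, BRW_MATH_FUNCTION_EXP, BRW_MATH_SATURATE_NONE,
 *                2, src, BRW_MATH_PRECISION_FULL);
 */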
1846
1847
1848 /**
1849 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1850 * using a constant offset per channel.
1851 *
1852 * The offset must be aligned to oword size (16 bytes). Used for
1853 * register spilling.
1854 */
1855 void brw_oword_block_write_scratch(struct brw_compile *p,
1856 struct brw_reg mrf,
1857 int num_regs,
1858 GLuint offset)
1859 {
1860 struct intel_context *intel = &p->brw->intel;
1861 uint32_t msg_control, msg_type;
1862 int mlen;
1863
1864 if (intel->gen >= 6)
1865 offset /= 16;
1866
1867 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1868
1869 if (num_regs == 1) {
1870 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1871 mlen = 2;
1872 } else {
1873 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1874 mlen = 3;
1875 }
1876
1877 /* Set up the message header. This is g0, with g0.2 filled with
1878 * the offset. We don't want to leave our offset around in g0 or
1879 * it'll screw up texture samples, so set it up inside the message
1880 * reg.
1881 */
1882 {
1883 brw_push_insn_state(p);
1884 brw_set_mask_control(p, BRW_MASK_DISABLE);
1885 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1886
1887 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1888
1889 /* set message header global offset field (reg 0, element 2) */
1890 brw_MOV(p,
1891 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1892 mrf.nr,
1893 2), BRW_REGISTER_TYPE_UD),
1894 brw_imm_ud(offset));
1895
1896 brw_pop_insn_state(p);
1897 }
1898
1899 {
1900 struct brw_reg dest;
1901 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1902 int send_commit_msg;
1903 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1904 BRW_REGISTER_TYPE_UW);
1905
1906 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1907 insn->header.compression_control = BRW_COMPRESSION_NONE;
1908 src_header = vec16(src_header);
1909 }
1910 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1911 insn->header.destreg__conditionalmod = mrf.nr;
1912
1913 /* Until gen6, writes followed by reads from the same location
1914 * are not guaranteed to be ordered unless write_commit is set.
1915 * If set, then a no-op write is issued to the destination
1916 * register to set a dependency, and a read from the destination
1917 * can be used to ensure the ordering.
1918 *
1919 * For gen6, only writes between different threads need ordering
1920 * protection. Our use of DP writes is all about register
1921 * spilling within a thread.
1922 */
1923 if (intel->gen >= 6) {
1924 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1925 send_commit_msg = 0;
1926 } else {
1927 dest = src_header;
1928 send_commit_msg = 1;
1929 }
1930
1931 brw_set_dest(p, insn, dest);
1932 if (intel->gen >= 6) {
1933 brw_set_src0(p, insn, mrf);
1934 } else {
1935 brw_set_src0(p, insn, brw_null_reg());
1936 }
1937
1938 if (intel->gen >= 6)
1939 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1940 else
1941 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1942
1943 brw_set_dp_write_message(p,
1944 insn,
1945 255, /* binding table index (255=stateless) */
1946 msg_control,
1947 msg_type,
1948 mlen,
1949 true, /* header_present */
1950 0, /* not a render target */
1951 send_commit_msg, /* response_length */
1952 0, /* eot */
1953 send_commit_msg);
1954 }
1955 }
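
/* Usage sketch (illustrative; 'value' and the MRF numbers are hypothetical):
 * spilling a single GRF. The data is expected in the register after the
 * header MRF, so it is moved to m2 before the send is emitted with its
 * header in m1:
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD),
 *            retype(value, BRW_REGISTER_TYPE_UD));
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, spill_offset);
 */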
1956
1957
1958 /**
1959 * Read a block of owords (half a GRF each) from the scratch buffer
1960 * using a constant index per channel.
1961 *
1962 * Offset must be aligned to oword size (16 bytes). Used for register
1963 * spilling.
1964 */
1965 void
1966 brw_oword_block_read_scratch(struct brw_compile *p,
1967 struct brw_reg dest,
1968 struct brw_reg mrf,
1969 int num_regs,
1970 GLuint offset)
1971 {
1972 struct intel_context *intel = &p->brw->intel;
1973 uint32_t msg_control;
1974 int rlen;
1975
1976 if (intel->gen >= 6)
1977 offset /= 16;
1978
1979 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1980 dest = retype(dest, BRW_REGISTER_TYPE_UW);
1981
1982 if (num_regs == 1) {
1983 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1984 rlen = 1;
1985 } else {
1986 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1987 rlen = 2;
1988 }
1989
1990 {
1991 brw_push_insn_state(p);
1992 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1993 brw_set_mask_control(p, BRW_MASK_DISABLE);
1994
1995 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1996
1997 /* set message header global offset field (reg 0, element 2) */
1998 brw_MOV(p,
1999 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2000 mrf.nr,
2001 2), BRW_REGISTER_TYPE_UD),
2002 brw_imm_ud(offset));
2003
2004 brw_pop_insn_state(p);
2005 }
2006
2007 {
2008 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2009
2010 assert(insn->header.predicate_control == 0);
2011 insn->header.compression_control = BRW_COMPRESSION_NONE;
2012 insn->header.destreg__conditionalmod = mrf.nr;
2013
2014 brw_set_dest(p, insn, dest); /* dest was retyped to UW above */
2015 if (intel->gen >= 6) {
2016 brw_set_src0(p, insn, mrf);
2017 } else {
2018 brw_set_src0(p, insn, brw_null_reg());
2019 }
2020
2021 brw_set_dp_read_message(p,
2022 insn,
2023 255, /* binding table index (255=stateless) */
2024 msg_control,
2025 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2026 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2027 1, /* msg_length */
2028 rlen);
2029 }
2030 }
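
/* Usage sketch (illustrative; 'dst' and 'spill_offset' are hypothetical):
 * unspilling the GRF written above, reusing m1 for the message header:
 *
 *    brw_oword_block_read_scratch(p, dst, brw_message_reg(1), 1,
 *                                 spill_offset);
 */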
2031
2032 /**
2033 * Read a float[4] vector from the data port Data Cache (const buffer).
2034 * Location (in buffer) should be a multiple of 16.
2035 * Used for fetching shader constants.
2036 */
2037 void brw_oword_block_read(struct brw_compile *p,
2038 struct brw_reg dest,
2039 struct brw_reg mrf,
2040 uint32_t offset,
2041 uint32_t bind_table_index)
2042 {
2043 struct intel_context *intel = &p->brw->intel;
2044
2045 /* On gen6+, the offset is in units of owords (16 bytes), not bytes. */
2046 if (intel->gen >= 6)
2047 offset /= 16;
2048
2049 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2050
2051 brw_push_insn_state(p);
2052 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2053 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2054 brw_set_mask_control(p, BRW_MASK_DISABLE);
2055
2056 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2057
2058 /* set message header global offset field (reg 0, element 2) */
2059 brw_MOV(p,
2060 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2061 mrf.nr,
2062 2), BRW_REGISTER_TYPE_UD),
2063 brw_imm_ud(offset));
2064
2065 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2066 insn->header.destreg__conditionalmod = mrf.nr;
2067
2068 /* cast dest to a uword[8] vector */
2069 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2070
2071 brw_set_dest(p, insn, dest);
2072 if (intel->gen >= 6) {
2073 brw_set_src0(p, insn, mrf);
2074 } else {
2075 brw_set_src0(p, insn, brw_null_reg());
2076 }
2077
2078 brw_set_dp_read_message(p,
2079 insn,
2080 bind_table_index,
2081 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2082 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2083 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2084 1, /* msg_length */
2085 1); /* response_length (1 reg, 2 owords!) */
2086
2087 brw_pop_insn_state(p);
2088 }
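
/* Usage sketch (illustrative; 'dst' and 'surf_index' are hypothetical):
 * pulling the float[4] constant at byte offset 32 of the bound surface:
 *
 *    brw_oword_block_read(p, dst, brw_message_reg(1), 32, surf_index);
 */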
2089
2090 /**
2091 * Read a set of dwords from the data port Data Cache (const buffer).
2092 *
2093 * Location (in buffer) appears as UD offsets in the register after
2094 * the provided mrf header reg.
2095 */
2096 void brw_dword_scattered_read(struct brw_compile *p,
2097 struct brw_reg dest,
2098 struct brw_reg mrf,
2099 uint32_t bind_table_index)
2100 {
2101 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2102
2103 brw_push_insn_state(p);
2104 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2105 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2106 brw_set_mask_control(p, BRW_MASK_DISABLE);
2107 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2108 brw_pop_insn_state(p);
2109
2110 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2111 insn->header.destreg__conditionalmod = mrf.nr;
2112
2113 /* cast dest to a uword[8] vector */
2114 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2115
2116 brw_set_dest(p, insn, dest);
2117 brw_set_src0(p, insn, brw_null_reg());
2118
2119 brw_set_dp_read_message(p,
2120 insn,
2121 bind_table_index,
2122 BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
2123 BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
2124 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2125 2, /* msg_length */
2126 1); /* response_length */
2127 }
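
/* Usage sketch (illustrative; 'dst', 'offsets' and 'surf_index' are
 * hypothetical): the per-channel dword offsets are expected in the register
 * after the header MRF (msg_length is 2), so they are moved to m2 first:
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD), offsets);
 *    brw_dword_scattered_read(p, dst, brw_message_reg(1), surf_index);
 */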
2128
2129
2130
2131 /**
2132 * Read a float[4] constant from the VS constant buffer.
2133 * One float[4] constant is read into the lower half of 'dest'; for
2134 * relative addressing, see brw_dp_READ_4_vs_relative() below.
2135 */
2136 void brw_dp_READ_4_vs(struct brw_compile *p,
2137 struct brw_reg dest,
2138 GLuint location,
2139 GLuint bind_table_index)
2140 {
2141 struct intel_context *intel = &p->brw->intel;
2142 struct brw_instruction *insn;
2143 GLuint msg_reg_nr = 1;
2144
2145 if (intel->gen >= 6)
2146 location /= 16;
2147
2148 /* Set up MRF[1] with the location/offset into the const buffer */
2149 brw_push_insn_state(p);
2150 brw_set_access_mode(p, BRW_ALIGN_1);
2151 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2152 brw_set_mask_control(p, BRW_MASK_DISABLE);
2153 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2154 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
2155 BRW_REGISTER_TYPE_UD),
2156 brw_imm_ud(location));
2157 brw_pop_insn_state(p);
2158
2159 insn = next_insn(p, BRW_OPCODE_SEND);
2160
2161 insn->header.predicate_control = BRW_PREDICATE_NONE;
2162 insn->header.compression_control = BRW_COMPRESSION_NONE;
2163 insn->header.destreg__conditionalmod = msg_reg_nr;
2164 insn->header.mask_control = BRW_MASK_DISABLE;
2165
2166 brw_set_dest(p, insn, dest);
2167 if (intel->gen >= 6) {
2168 brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
2169 } else {
2170 brw_set_src0(p, insn, brw_null_reg());
2171 }
2172
2173 brw_set_dp_read_message(p,
2174 insn,
2175 bind_table_index,
2176 0,
2177 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2178 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2179 1, /* msg_length */
2180 1); /* response_length (1 Oword) */
2181 }
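
/* Usage sketch (illustrative; 'dst', 'i' and 'surf_index' are hypothetical):
 * fetching the float[4] constant at index i. Callers pass a byte offset,
 * which this helper converts to oword units on gen6+:
 *
 *    brw_dp_READ_4_vs(p, dst, 16 * i, surf_index);
 */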
2182
2183 /**
2184 * Read a float[4] constant per vertex from VS constant buffer, with
2185 * relative addressing.
2186 */
2187 void brw_dp_READ_4_vs_relative(struct brw_compile *p,
2188 struct brw_reg dest,
2189 struct brw_reg addr_reg,
2190 GLuint offset,
2191 GLuint bind_table_index)
2192 {
2193 struct intel_context *intel = &p->brw->intel;
2194 struct brw_reg src = brw_vec8_grf(0, 0);
2195 int msg_type;
2196
2197 /* Set up MRF[1] with the offset into the const buffer */
2198 brw_push_insn_state(p);
2199 brw_set_access_mode(p, BRW_ALIGN_1);
2200 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2201 brw_set_mask_control(p, BRW_MASK_DISABLE);
2202 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2203
2204 /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2205 * fields ignored.
2206 */
2207 brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2208 addr_reg, brw_imm_d(offset));
2209 brw_pop_insn_state(p);
2210
2211 gen6_resolve_implied_move(p, &src, 0);
2212 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2213
2214 insn->header.predicate_control = BRW_PREDICATE_NONE;
2215 insn->header.compression_control = BRW_COMPRESSION_NONE;
2216 insn->header.destreg__conditionalmod = 0;
2217 insn->header.mask_control = BRW_MASK_DISABLE;
2218
2219 brw_set_dest(p, insn, dest);
2220 brw_set_src0(p, insn, src);
2221
2222 if (intel->gen >= 6)
2223 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2224 else if (intel->gen == 5 || intel->is_g4x)
2225 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2226 else
2227 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2228
2229 brw_set_dp_read_message(p,
2230 insn,
2231 bind_table_index,
2232 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2233 msg_type,
2234 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2235 2, /* msg_length */
2236 1); /* response_length */
2237 }
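
/* Usage sketch (illustrative; 'dst', 'addr_reg', 'base' and 'surf_index'
 * are hypothetical): addr_reg supplies the per-vertex offsets, to which the
 * constant 'base' is added before the read:
 *
 *    brw_dp_READ_4_vs_relative(p, dst, addr_reg, base, surf_index);
 */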
2238
2239
2240
2241 void brw_fb_WRITE(struct brw_compile *p,
2242 int dispatch_width,
2243 GLuint msg_reg_nr,
2244 struct brw_reg src0,
2245 GLuint msg_control,
2246 GLuint binding_table_index,
2247 GLuint msg_length,
2248 GLuint response_length,
2249 bool eot,
2250 bool header_present)
2251 {
2252 struct intel_context *intel = &p->brw->intel;
2253 struct brw_instruction *insn;
2254 GLuint msg_type;
2255 struct brw_reg dest;
2256
2257 if (dispatch_width == 16)
2258 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2259 else
2260 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2261
2262 if (intel->gen >= 6 && binding_table_index == 0) {
2263 insn = next_insn(p, BRW_OPCODE_SENDC);
2264 } else {
2265 insn = next_insn(p, BRW_OPCODE_SEND);
2266 }
2267 /* The execution mask is ignored for render target writes. */
2268 insn->header.predicate_control = 0;
2269 insn->header.compression_control = BRW_COMPRESSION_NONE;
2270
2271 if (intel->gen >= 6) {
2272 /* headerless version, just submit color payload */
2273 src0 = brw_message_reg(msg_reg_nr);
2274
2275 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2276 } else {
2277 insn->header.destreg__conditionalmod = msg_reg_nr;
2278
2279 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2280 }
2281
2282 brw_set_dest(p, insn, dest);
2283 brw_set_src0(p, insn, src0);
2284 brw_set_dp_write_message(p,
2285 insn,
2286 binding_table_index,
2287 msg_control,
2288 msg_type,
2289 msg_length,
2290 header_present,
2291 eot, /* last render target write */
2292 response_length,
2293 eot,
2294 0 /* send_commit_msg */);
2295 }
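
/* Usage sketch (illustrative; all lengths and the binding-table slot are
 * hypothetical): the final SIMD8 render-target write of a fragment thread,
 * with a headerless 4-register color payload starting at m2:
 *
 *    brw_fb_WRITE(p, 8, 2, brw_message_reg(2),
 *                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01,
 *                 0, 4, 0, true, false);
 *
 * where binding_table_index = 0, msg_length = 4, response_length = 0,
 * eot = true and header_present = false.
 */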
2296
2297
2298 /**
2299 * Texture sample instruction.
2300 * Note: the msg_type plus msg_length values determine exactly what kind
2301 * of sampling operation is performed. See volume 4, page 161 of docs.
2302 */
2303 void brw_SAMPLE(struct brw_compile *p,
2304 struct brw_reg dest,
2305 GLuint msg_reg_nr,
2306 struct brw_reg src0,
2307 GLuint binding_table_index,
2308 GLuint sampler,
2309 GLuint writemask,
2310 GLuint msg_type,
2311 GLuint response_length,
2312 GLuint msg_length,
2313 GLuint header_present,
2314 GLuint simd_mode,
2315 GLuint return_format)
2316 {
2317 struct intel_context *intel = &p->brw->intel;
2318 bool need_stall = false;
2319
2320 if (writemask == 0) {
2321 /*printf("%s: zero writemask??\n", __FUNCTION__); */
2322 return;
2323 }
2324
2325 /* Hardware doesn't do destination dependency checking on send
2326 * instructions properly. Add a workaround which generates the
2327 * dependency by other means. In practice it seems like this bug
2328 * only crops up for texture samples, and only where registers are
2329 * written by the send and then written again later without being
2330 * read in between. Luckily for us, we already track that
2331 * information and use it to modify the writemask for the
2332 * instruction, so that is a guide for whether a workaround is
2333 * needed.
2334 */
2335 if (writemask != WRITEMASK_XYZW) {
2336 GLuint dst_offset = 0;
2337 GLuint i, newmask = 0, len = 0;
2338
2339 for (i = 0; i < 4; i++) {
2340 if (writemask & (1<<i))
2341 break;
2342 dst_offset += 2;
2343 }
2344 for (; i < 4; i++) {
2345 if (!(writemask & (1<<i)))
2346 break;
2347 newmask |= 1<<i;
2348 len++;
2349 }
2350
2351 if (newmask != writemask) {
2352 need_stall = true;
2353 /* printf("need stall %x %x\n", newmask , writemask); */
2354 }
2355 else {
2356 bool dispatch_16 = false;
2357
2358 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2359
2360 guess_execution_size(p, p->current, dest);
2361 if (p->current->header.execution_size == BRW_EXECUTE_16)
2362 dispatch_16 = true;
2363
2364 newmask = ~newmask & WRITEMASK_XYZW;
2365
2366 brw_push_insn_state(p);
2367
2368 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2369 brw_set_mask_control(p, BRW_MASK_DISABLE);
2370
2371 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2372 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2373 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2374
2375 brw_pop_insn_state(p);
2376
2377 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2378 dest = offset(dest, dst_offset);
2379
2380 /* For 16-wide dispatch, masked channels are skipped in the
2381 * response. For 8-wide, masked channels still take up slots,
2382 * and are just not written to.
2383 */
2384 if (dispatch_16)
2385 response_length = len * 2;
2386 }
2387 }
2388
2389 {
2390 struct brw_instruction *insn;
2391
2392 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2393
2394 insn = next_insn(p, BRW_OPCODE_SEND);
2395 insn->header.predicate_control = 0; /* XXX */
2396 insn->header.compression_control = BRW_COMPRESSION_NONE;
2397 if (intel->gen < 6)
2398 insn->header.destreg__conditionalmod = msg_reg_nr;
2399
2400 brw_set_dest(p, insn, dest);
2401 brw_set_src0(p, insn, src0);
2402 brw_set_sampler_message(p, insn,
2403 binding_table_index,
2404 sampler,
2405 msg_type,
2406 response_length,
2407 msg_length,
2408 header_present,
2409 simd_mode,
2410 return_format);
2411 }
2412
2413 if (need_stall) {
2414 struct brw_reg reg = vec8(offset(dest, response_length-1));
2415
2416 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2417 */
2418 brw_push_insn_state(p);
2419 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2420 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2421 retype(reg, BRW_REGISTER_TYPE_UD));
2422 brw_pop_insn_state(p);
2423 }
2424
2425 }
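
/* Usage sketch (illustrative; the surface, sampler and registers are
 * hypothetical): a SIMD8 'sample' message with a 2-register payload
 * starting at m1 and a 4-register float result:
 *
 *    brw_SAMPLE(p, retype(dst, BRW_REGISTER_TYPE_UW), 1, src,
 *               surf_index, 0, WRITEMASK_XYZW,
 *               BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
 *               4, 2, 0, BRW_SAMPLER_SIMD_MODE_SIMD8,
 *               BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
 */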
2426
2427 /* All these variables are pretty confusing - we might be better off
2428 * using bitmasks and macros for this, in the old style. Or perhaps
2429 * just having the caller instantiate the fields in dword3 itself.
2430 */
2431 void brw_urb_WRITE(struct brw_compile *p,
2432 struct brw_reg dest,
2433 GLuint msg_reg_nr,
2434 struct brw_reg src0,
2435 bool allocate,
2436 bool used,
2437 GLuint msg_length,
2438 GLuint response_length,
2439 bool eot,
2440 bool writes_complete,
2441 GLuint offset,
2442 GLuint swizzle)
2443 {
2444 struct intel_context *intel = &p->brw->intel;
2445 struct brw_instruction *insn;
2446
2447 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2448
2449 if (intel->gen == 7) {
2450 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2451 brw_push_insn_state(p);
2452 brw_set_access_mode(p, BRW_ALIGN_1);
2453 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2454 BRW_REGISTER_TYPE_UD),
2455 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2456 brw_imm_ud(0xff00));
2457 brw_pop_insn_state(p);
2458 }
2459
2460 insn = next_insn(p, BRW_OPCODE_SEND);
2461
2462 assert(msg_length < BRW_MAX_MRF);
2463
2464 brw_set_dest(p, insn, dest);
2465 brw_set_src0(p, insn, src0);
2466 brw_set_src1(p, insn, brw_imm_d(0));
2467
2468 if (intel->gen < 6)
2469 insn->header.destreg__conditionalmod = msg_reg_nr;
2470
2471 brw_set_urb_message(p,
2472 insn,
2473 allocate,
2474 used,
2475 msg_length,
2476 response_length,
2477 eot,
2478 writes_complete,
2479 offset,
2480 swizzle);
2481 }
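
/* Usage sketch (illustrative; msg_length and the source register are
 * hypothetical): a vertex thread's final URB write, sending the VUE
 * starting at m1 and terminating the thread:
 *
 *    brw_urb_WRITE(p, brw_null_reg(), 1, brw_vec8_grf(0, 0),
 *                  false, true, msg_length, 0,
 *                  true, true, 0, BRW_URB_SWIZZLE_INTERLEAVE);
 *
 * where allocate = false, used = true, response_length = 0, eot = true,
 * writes_complete = true and offset = 0.
 */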
2482
2483 static int
2484 brw_find_next_block_end(struct brw_compile *p, int start)
2485 {
2486 int ip;
2487
2488 for (ip = start + 1; ip < p->nr_insn; ip++) {
2489 struct brw_instruction *insn = &p->store[ip];
2490
2491 switch (insn->header.opcode) {
2492 case BRW_OPCODE_ENDIF:
2493 case BRW_OPCODE_ELSE:
2494 case BRW_OPCODE_WHILE:
2495 return ip;
2496 }
2497 }
2498 assert(!"not reached");
2499 return start + 1;
2500 }
2501
2502 /* There is no DO instruction on gen6 and later, so to find the end of the loop
2503 * we have to see if the loop is jumping back before our start
2504 * instruction.
2505 */
2506 static int
2507 brw_find_loop_end(struct brw_compile *p, int start)
2508 {
2509 struct intel_context *intel = &p->brw->intel;
2510 int ip;
2511 int br = 2;
2512
2513 for (ip = start + 1; ip < p->nr_insn; ip++) {
2514 struct brw_instruction *insn = &p->store[ip];
2515
2516 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2517 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2518 : insn->bits3.break_cont.jip;
2519 if (ip + jip / br <= start)
2520 return ip;
2521 }
2522 }
2523 assert(!"not reached");
2524 return start + 1;
2525 }
2526
2527 /* After program generation, go back and update the UIP and JIP of
2528 * BREAK and CONT instructions to their correct locations.
2529 */
2530 void
2531 brw_set_uip_jip(struct brw_compile *p)
2532 {
2533 struct intel_context *intel = &p->brw->intel;
2534 int ip;
2535 int br = 2;
2536
2537 if (intel->gen < 6)
2538 return;
2539
2540 for (ip = 0; ip < p->nr_insn; ip++) {
2541 struct brw_instruction *insn = &p->store[ip];
2542
2543 switch (insn->header.opcode) {
2544 case BRW_OPCODE_BREAK:
2545 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2546 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2547 insn->bits3.break_cont.uip =
2548 br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
2549 break;
2550 case BRW_OPCODE_CONTINUE:
2551 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2552 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
2553
2554 assert(insn->bits3.break_cont.uip != 0);
2555 assert(insn->bits3.break_cont.jip != 0);
2556 break;
2557 }
2558 }
2559 }
2560
2561 void brw_ff_sync(struct brw_compile *p,
2562 struct brw_reg dest,
2563 GLuint msg_reg_nr,
2564 struct brw_reg src0,
2565 bool allocate,
2566 GLuint response_length,
2567 bool eot)
2568 {
2569 struct intel_context *intel = &p->brw->intel;
2570 struct brw_instruction *insn;
2571
2572 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2573
2574 insn = next_insn(p, BRW_OPCODE_SEND);
2575 brw_set_dest(p, insn, dest);
2576 brw_set_src0(p, insn, src0);
2577 brw_set_src1(p, insn, brw_imm_d(0));
2578
2579 if (intel->gen < 6)
2580 insn->header.destreg__conditionalmod = msg_reg_nr;
2581
2582 brw_set_ff_sync_message(p,
2583 insn,
2584 allocate,
2585 response_length,
2586 eot);
2587 }
2588
2589 /**
2590 * Emit the SEND instruction necessary to generate stream output data on Gen6
2591 * (for transform feedback).
2592 *
2593 * If send_commit_msg is true, this is the last piece of stream output data
2594 * from this thread, so send the data as a committed write. According to the
2595 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2596 *
2597 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2598 * writes are complete by sending the final write as a committed write."
2599 */
2600 void
2601 brw_svb_write(struct brw_compile *p,
2602 struct brw_reg dest,
2603 GLuint msg_reg_nr,
2604 struct brw_reg src0,
2605 GLuint binding_table_index,
2606 bool send_commit_msg)
2607 {
2608 struct brw_instruction *insn;
2609
2610 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2611
2612 insn = next_insn(p, BRW_OPCODE_SEND);
2613 brw_set_dest(p, insn, dest);
2614 brw_set_src0(p, insn, src0);
2615 brw_set_src1(p, insn, brw_imm_d(0));
2616 brw_set_dp_write_message(p, insn,
2617 binding_table_index,
2618 0, /* msg_control: ignored */
2619 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2620 1, /* msg_length */
2621 true, /* header_present */
2622 0, /* last_render_target: ignored */
2623 send_commit_msg, /* response_length */
2624 0, /* end_of_thread */
2625 send_commit_msg); /* send_commit_msg */
2626 }
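
/* Usage sketch (illustrative; 'data_reg', 'commit_reg' and 'surf_index' are
 * hypothetical): writing one buffer's worth of stream output data. When this
 * is the thread's last write, the destination receives the commit return:
 *
 *    brw_svb_write(p, final ? commit_reg : brw_null_reg(), 1, data_reg,
 *                  surf_index, final);
 */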