i965/fs: Add support for translating ir_triop_fma into MAD.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 *
 * \param src         source operand of the upcoming SEND; rewritten in
 *                    place to name the message register.
 * \param msg_reg_nr  message register to stage the payload in.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  GLuint msg_reg_nr)
{
   struct brw_context *brw = p->brw;
   /* Pre-gen6 hardware performs the move implicitly; nothing to do. */
   if (brw->gen < 6)
      return;

   /* Already a message register: no staging copy needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Emit the copy with masking and compression disabled so the whole
       * payload register is written regardless of the current state.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Point the caller's source at the staged message register. */
   *src = brw_message_reg(msg_reg_nr);
}
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102
/**
 * Encode the destination operand of \p insn from \p dest.
 *
 * Handles direct and register-indirect addressing, and both align1 and
 * align16 access modes (whose bit layouts differ).  Also derives the
 * instruction's execution size from the destination register width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* Stride 0 is not a valid destination stride; promote it to 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16 subregister numbers are in 16-byte units. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same stride-0 promotion as the direct align1 case above. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
159
160 extern int reg_type_size[];
161
162 static void
163 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
164 {
165 int hstride_for_reg[] = {0, 1, 2, 4};
166 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
167 int width_for_reg[] = {1, 2, 4, 8, 16};
168 int execsize_for_reg[] = {1, 2, 4, 8, 16};
169 int width, hstride, vstride, execsize;
170
171 if (reg.file == BRW_IMMEDIATE_VALUE) {
172 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
173 * mean the destination has to be 128-bit aligned and the
174 * destination horiz stride has to be a word.
175 */
176 if (reg.type == BRW_REGISTER_TYPE_V) {
177 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
178 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
179 }
180
181 return;
182 }
183
184 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
185 reg.file == BRW_ARF_NULL)
186 return;
187
188 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
189 hstride = hstride_for_reg[reg.hstride];
190
191 if (reg.vstride == 0xf) {
192 vstride = -1;
193 } else {
194 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
195 vstride = vstride_for_reg[reg.vstride];
196 }
197
198 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
199 width = width_for_reg[reg.width];
200
201 assert(insn->header.execution_size >= 0 &&
202 insn->header.execution_size < Elements(execsize_for_reg));
203 execsize = execsize_for_reg[insn->header.execution_size];
204
205 /* Restrictions from 3.3.10: Register Region Restrictions. */
206 /* 3. */
207 assert(execsize >= width);
208
209 /* 4. */
210 if (execsize == width && hstride != 0) {
211 assert(vstride == -1 || vstride == width * hstride);
212 }
213
214 /* 5. */
215 if (execsize == width && hstride == 0) {
216 /* no restriction on vstride. */
217 }
218
219 /* 6. */
220 if (width == 1) {
221 assert(hstride == 0);
222 }
223
224 /* 7. */
225 if (execsize == 1 && width == 1) {
226 assert(hstride == 0);
227 assert(vstride == 0);
228 }
229
230 /* 8. */
231 if (vstride == 0 && hstride == 0) {
232 assert(width == 1);
233 }
234
235 /* 10. Check destination issues. */
236 }
237
238 void
239 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
240 struct brw_reg reg)
241 {
242 struct brw_context *brw = p->brw;
243
244 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
245 assert(reg.nr < 128);
246
247 gen7_convert_mrf_to_grf(p, &reg);
248
249 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
250 insn->header.opcode == BRW_OPCODE_SENDC)) {
251 /* Any source modifiers or regions will be ignored, since this just
252 * identifies the MRF/GRF to start reading the message contents from.
253 * Check for some likely failures.
254 */
255 assert(!reg.negate);
256 assert(!reg.abs);
257 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
258 }
259
260 validate_reg(insn, reg);
261
262 insn->bits1.da1.src0_reg_file = reg.file;
263 insn->bits1.da1.src0_reg_type = reg.type;
264 insn->bits2.da1.src0_abs = reg.abs;
265 insn->bits2.da1.src0_negate = reg.negate;
266 insn->bits2.da1.src0_address_mode = reg.address_mode;
267
268 if (reg.file == BRW_IMMEDIATE_VALUE) {
269 insn->bits3.ud = reg.dw1.ud;
270
271 /* Required to set some fields in src1 as well:
272 */
273 insn->bits1.da1.src1_reg_file = 0; /* arf */
274 insn->bits1.da1.src1_reg_type = reg.type;
275 }
276 else
277 {
278 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
279 if (insn->header.access_mode == BRW_ALIGN_1) {
280 insn->bits2.da1.src0_subreg_nr = reg.subnr;
281 insn->bits2.da1.src0_reg_nr = reg.nr;
282 }
283 else {
284 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
285 insn->bits2.da16.src0_reg_nr = reg.nr;
286 }
287 }
288 else {
289 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
290
291 if (insn->header.access_mode == BRW_ALIGN_1) {
292 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
293 }
294 else {
295 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
296 }
297 }
298
299 if (insn->header.access_mode == BRW_ALIGN_1) {
300 if (reg.width == BRW_WIDTH_1 &&
301 insn->header.execution_size == BRW_EXECUTE_1) {
302 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
303 insn->bits2.da1.src0_width = BRW_WIDTH_1;
304 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
305 }
306 else {
307 insn->bits2.da1.src0_horiz_stride = reg.hstride;
308 insn->bits2.da1.src0_width = reg.width;
309 insn->bits2.da1.src0_vert_stride = reg.vstride;
310 }
311 }
312 else {
313 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
314 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
315 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
316 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
317
318 /* This is an oddity of the fact we're using the same
319 * descriptions for registers in align_16 as align_1:
320 */
321 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
322 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
323 else
324 insn->bits2.da16.src0_vert_stride = reg.vstride;
325 }
326 }
327 }
328
329
330 void brw_set_src1(struct brw_compile *p,
331 struct brw_instruction *insn,
332 struct brw_reg reg)
333 {
334 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
335
336 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
337 assert(reg.nr < 128);
338
339 gen7_convert_mrf_to_grf(p, &reg);
340
341 validate_reg(insn, reg);
342
343 insn->bits1.da1.src1_reg_file = reg.file;
344 insn->bits1.da1.src1_reg_type = reg.type;
345 insn->bits3.da1.src1_abs = reg.abs;
346 insn->bits3.da1.src1_negate = reg.negate;
347
348 /* Only src1 can be immediate in two-argument instructions.
349 */
350 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
351
352 if (reg.file == BRW_IMMEDIATE_VALUE) {
353 insn->bits3.ud = reg.dw1.ud;
354 }
355 else {
356 /* This is a hardware restriction, which may or may not be lifted
357 * in the future:
358 */
359 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
360 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
361
362 if (insn->header.access_mode == BRW_ALIGN_1) {
363 insn->bits3.da1.src1_subreg_nr = reg.subnr;
364 insn->bits3.da1.src1_reg_nr = reg.nr;
365 }
366 else {
367 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
368 insn->bits3.da16.src1_reg_nr = reg.nr;
369 }
370
371 if (insn->header.access_mode == BRW_ALIGN_1) {
372 if (reg.width == BRW_WIDTH_1 &&
373 insn->header.execution_size == BRW_EXECUTE_1) {
374 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
375 insn->bits3.da1.src1_width = BRW_WIDTH_1;
376 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
377 }
378 else {
379 insn->bits3.da1.src1_horiz_stride = reg.hstride;
380 insn->bits3.da1.src1_width = reg.width;
381 insn->bits3.da1.src1_vert_stride = reg.vstride;
382 }
383 }
384 else {
385 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
386 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
387 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
388 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
389
390 /* This is an oddity of the fact we're using the same
391 * descriptions for registers in align_16 as align_1:
392 */
393 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
394 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
395 else
396 insn->bits3.da16.src1_vert_stride = reg.vstride;
397 }
398 }
399 }
400
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \param sfid            shared function (message target) addressed by the send.
 * \param msg_length      payload length, in registers.
 * \param response_length writeback length, in registers.
 * \param header_present  true if the payload begins with a message header.
 * \param end_of_thread   true if this send terminates the thread.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct brw_context *brw = p->brw;

   /* Clear the descriptor (bits3) by encoding an immediate 0 as src1. */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (brw->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (brw->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
443
444 static void brw_set_math_message( struct brw_compile *p,
445 struct brw_instruction *insn,
446 GLuint function,
447 GLuint integer_type,
448 bool low_precision,
449 GLuint dataType )
450 {
451 struct brw_context *brw = p->brw;
452 unsigned msg_length;
453 unsigned response_length;
454
455 /* Infer message length from the function */
456 switch (function) {
457 case BRW_MATH_FUNCTION_POW:
458 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
459 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
460 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
461 msg_length = 2;
462 break;
463 default:
464 msg_length = 1;
465 break;
466 }
467
468 /* Infer response length from the function */
469 switch (function) {
470 case BRW_MATH_FUNCTION_SINCOS:
471 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
472 response_length = 2;
473 break;
474 default:
475 response_length = 1;
476 break;
477 }
478
479
480 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
481 msg_length, response_length, false, false);
482 if (brw->gen == 5) {
483 insn->bits3.math_gen5.function = function;
484 insn->bits3.math_gen5.int_type = integer_type;
485 insn->bits3.math_gen5.precision = low_precision;
486 insn->bits3.math_gen5.saturate = insn->header.saturate;
487 insn->bits3.math_gen5.data_type = dataType;
488 insn->bits3.math_gen5.snapshot = 0;
489 } else {
490 insn->bits3.math.function = function;
491 insn->bits3.math.int_type = integer_type;
492 insn->bits3.math.precision = low_precision;
493 insn->bits3.math.saturate = insn->header.saturate;
494 insn->bits3.math.data_type = dataType;
495 }
496 insn->header.saturate = 0;
497 }
498
499
500 static void brw_set_ff_sync_message(struct brw_compile *p,
501 struct brw_instruction *insn,
502 bool allocate,
503 GLuint response_length,
504 bool end_of_thread)
505 {
506 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
507 1, response_length, true, end_of_thread);
508 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
509 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
510 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
511 insn->bits3.urb_gen5.allocate = allocate;
512 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
513 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
514 }
515
/* Fill in the descriptor for a URB write message, using the layout
 * appropriate for the target generation (gen7, gen5/6, or gen4).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 enum brw_urb_write_flags flags,
				 GLuint msg_length,
				 GLuint response_length,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
			      flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes have no transpose mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
	 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
553
/**
 * Fill in the descriptor for a data port write message.
 *
 * The shared function (SFID) the message is routed to depends on the
 * generation: gen7 splits render-target writes onto the render cache,
 * gen6 routes all writes through the render cache, and older parts use
 * the dedicated dataport-write SFID.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* The function-control bit layout also varies per generation. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
611
/**
 * Fill in the descriptor for a data port read message.
 *
 * The target SFID depends on the generation: gen7 uses the data cache,
 * gen6 picks the render or sampler cache based on \p target_cache, and
 * older parts use the dedicated dataport-read SFID.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			bool header_present,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   /* The function-control bit layout varies per generation; the trailing
    * comments give the bit positions of each field where noted.
    */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
668
/**
 * Fill in the descriptor for a sampler message, using the layout
 * appropriate for the target generation (gen7, gen5/6, g4x, or gen4).
 *
 * \note return_format is only encoded on original gen4; later layouts
 *       have no such field.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
707
708
709 #define next_insn brw_next_insn
710 struct brw_instruction *
711 brw_next_insn(struct brw_compile *p, GLuint opcode)
712 {
713 struct brw_instruction *insn;
714
715 if (p->nr_insn + 1 > p->store_size) {
716 if (0)
717 printf("incresing the store size to %d\n", p->store_size << 1);
718 p->store_size <<= 1;
719 p->store = reralloc(p->mem_ctx, p->store,
720 struct brw_instruction, p->store_size);
721 if (!p->store)
722 assert(!"realloc eu store memeory failed");
723 }
724
725 p->next_insn_offset += 16;
726 insn = &p->store[p->nr_insn++];
727 memcpy(insn, p->current, sizeof(*insn));
728
729 /* Reset this one-shot flag:
730 */
731
732 if (p->current->header.destreg__conditionalmod) {
733 p->current->header.destreg__conditionalmod = 0;
734 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
735 }
736
737 insn->header.opcode = opcode;
738 return insn;
739 }
740
741 static struct brw_instruction *brw_alu1( struct brw_compile *p,
742 GLuint opcode,
743 struct brw_reg dest,
744 struct brw_reg src )
745 {
746 struct brw_instruction *insn = next_insn(p, opcode);
747 brw_set_dest(p, insn, dest);
748 brw_set_src0(p, insn, src);
749 return insn;
750 }
751
752 static struct brw_instruction *brw_alu2(struct brw_compile *p,
753 GLuint opcode,
754 struct brw_reg dest,
755 struct brw_reg src0,
756 struct brw_reg src1 )
757 {
758 struct brw_instruction *insn = next_insn(p, opcode);
759 brw_set_dest(p, insn, dest);
760 brw_set_src0(p, insn, src0);
761 brw_set_src1(p, insn, src1);
762 return insn;
763 }
764
765 static int
766 get_3src_subreg_nr(struct brw_reg reg)
767 {
768 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
769 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
770 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
771 } else {
772 return reg.subnr / 4;
773 }
774 }
775
/**
 * Emit a three-source instruction (MAD, LRP, BFE, BFI2).
 *
 * 3-src instructions use a dedicated compact encoding: align16 access
 * mode only, GRF sources only (GRF or MRF destination), direct
 * addressing only, and a single shared type field for all operands.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					GLuint opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
	  dest.type == BRW_REGISTER_TYPE_D ||
	  dest.type == BRW_REGISTER_TYPE_UD);
   /* dest_reg_file is a single bit here: 0 = GRF, 1 = MRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar source across all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   /* src1's subregister number straddles the bits2/bits3 boundary. */
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
	 break;
      case BRW_REGISTER_TYPE_D:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
	 break;
      case BRW_REGISTER_TYPE_UD:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
	 break;
      }
   }

   return insn;
}
859
860
/***********************************************************************
 * Convenience routines.
 */

/* Define brw_<OP>() emitting a one-source instruction. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Define brw_<OP>() emitting a two-source instruction. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Define brw_<OP>() emitting a three-source instruction. */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Same as ALU3, but asserts all four operands are float-typed. */
#define ALU3F(OP)                                               \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
929
930
/* Instantiate the convenience emitters for each opcode. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)

ROUND(RNDZ)
ROUND(RNDE)
967
968
969 struct brw_instruction *brw_ADD(struct brw_compile *p,
970 struct brw_reg dest,
971 struct brw_reg src0,
972 struct brw_reg src1)
973 {
974 /* 6.2.2: add */
975 if (src0.type == BRW_REGISTER_TYPE_F ||
976 (src0.file == BRW_IMMEDIATE_VALUE &&
977 src0.type == BRW_REGISTER_TYPE_VF)) {
978 assert(src1.type != BRW_REGISTER_TYPE_UD);
979 assert(src1.type != BRW_REGISTER_TYPE_D);
980 }
981
982 if (src1.type == BRW_REGISTER_TYPE_F ||
983 (src1.file == BRW_IMMEDIATE_VALUE &&
984 src1.type == BRW_REGISTER_TYPE_VF)) {
985 assert(src0.type != BRW_REGISTER_TYPE_UD);
986 assert(src0.type != BRW_REGISTER_TYPE_D);
987 }
988
989 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
990 }
991
992 struct brw_instruction *brw_AVG(struct brw_compile *p,
993 struct brw_reg dest,
994 struct brw_reg src0,
995 struct brw_reg src1)
996 {
997 assert(dest.type == src0.type);
998 assert(src0.type == src1.type);
999 switch (src0.type) {
1000 case BRW_REGISTER_TYPE_B:
1001 case BRW_REGISTER_TYPE_UB:
1002 case BRW_REGISTER_TYPE_W:
1003 case BRW_REGISTER_TYPE_UW:
1004 case BRW_REGISTER_TYPE_D:
1005 case BRW_REGISTER_TYPE_UD:
1006 break;
1007 default:
1008 assert(!"Bad type for brw_AVG");
1009 }
1010
1011 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1012 }
1013
/* Emit a MUL instruction, asserting the PRM's type restrictions for mul
 * ("6.32.38: mul"): no float destination with 32-bit integer sources, no
 * float source paired with a 32-bit integer source, and neither source
 * may be the accumulator.
 */
struct brw_instruction *brw_MUL(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      /* Integer multiply may not write a float destination. */
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      /* Float (or VF-immediate) src0 forbids 32-bit integer src1. */
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      /* Float (or VF-immediate) src1 forbids 32-bit integer src0. */
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* MUL may not read the accumulator as a source. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1048
1049
1050 void brw_NOP(struct brw_compile *p)
1051 {
1052 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1053 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1054 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1055 brw_set_src1(p, insn, brw_imm_ud(0x0));
1056 }
1057
1058
1059
1060
1061
1062 /***********************************************************************
1063 * Comparisons, if/else/endif
1064 */
1065
1066 struct brw_instruction *brw_JMPI(struct brw_compile *p,
1067 struct brw_reg dest,
1068 struct brw_reg src0,
1069 struct brw_reg src1)
1070 {
1071 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1072
1073 insn->header.execution_size = 1;
1074 insn->header.compression_control = BRW_COMPRESSION_NONE;
1075 insn->header.mask_control = BRW_MASK_DISABLE;
1076
1077 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1078
1079 return insn;
1080 }
1081
1082 static void
1083 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1084 {
1085 p->if_stack[p->if_stack_depth] = inst - p->store;
1086
1087 p->if_stack_depth++;
1088 if (p->if_stack_array_size <= p->if_stack_depth) {
1089 p->if_stack_array_size *= 2;
1090 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1091 p->if_stack_array_size);
1092 }
1093 }
1094
1095 static struct brw_instruction *
1096 pop_if_stack(struct brw_compile *p)
1097 {
1098 p->if_stack_depth--;
1099 return &p->store[p->if_stack[p->if_stack_depth]];
1100 }
1101
1102 static void
1103 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1104 {
1105 if (p->loop_stack_array_size < p->loop_stack_depth) {
1106 p->loop_stack_array_size *= 2;
1107 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1108 p->loop_stack_array_size);
1109 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1110 p->loop_stack_array_size);
1111 }
1112
1113 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1114 p->loop_stack_depth++;
1115 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1116 }
1117
1118 static struct brw_instruction *
1119 get_inner_do_insn(struct brw_compile *p)
1120 {
1121 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1122 }
1123
/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Gen4-5: IF operates on IP with an embedded jump count, patched
       * later by patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: the branch offset lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+: IF carries JIP/UIP fields, filled in by patch_IF_ELSE(). */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   /* Flow-control instructions get a thread switch, except in single
    * program flow mode.
    */
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* The predicate is consumed by the IF itself; don't let it leak into
    * subsequent instructions.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1177
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The branch offset lives in the destination immediate; it is zero
    * here and patched later via patch_IF_ELSE().
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* The comparison is folded into the IF via the conditional modifier. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1209
1210 /**
1211 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1212 */
1213 static void
1214 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1215 struct brw_instruction *if_inst,
1216 struct brw_instruction *else_inst)
1217 {
1218 /* The next instruction (where the ENDIF would be, if it existed) */
1219 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1220
1221 assert(p->single_program_flow);
1222 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1223 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1224 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1225
1226 /* Convert IF to an ADD instruction that moves the instruction pointer
1227 * to the first instruction of the ELSE block. If there is no ELSE
1228 * block, point to where ENDIF would be. Reverse the predicate.
1229 *
1230 * There's no need to execute an ENDIF since we don't need to do any
1231 * stack operations, and if we're currently executing, we just want to
1232 * continue normally.
1233 */
1234 if_inst->header.opcode = BRW_OPCODE_ADD;
1235 if_inst->header.predicate_inverse = 1;
1236
1237 if (else_inst != NULL) {
1238 /* Convert ELSE to an ADD instruction that points where the ENDIF
1239 * would be.
1240 */
1241 else_inst->header.opcode = BRW_OPCODE_ADD;
1242
1243 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1244 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1245 } else {
1246 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1247 }
1248 }
1249
1250 /**
1251 * Patch IF and ELSE instructions with appropriate jump targets.
1252 */
1253 static void
1254 patch_IF_ELSE(struct brw_compile *p,
1255 struct brw_instruction *if_inst,
1256 struct brw_instruction *else_inst,
1257 struct brw_instruction *endif_inst)
1258 {
1259 struct brw_context *brw = p->brw;
1260
1261 /* We shouldn't be patching IF and ELSE instructions in single program flow
1262 * mode when gen < 6, because in single program flow mode on those
1263 * platforms, we convert flow control instructions to conditional ADDs that
1264 * operate on IP (see brw_ENDIF).
1265 *
1266 * However, on Gen6, writing to IP doesn't work in single program flow mode
1267 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1268 * not be updated by non-flow control instructions."). And on later
1269 * platforms, there is no significant benefit to converting control flow
1270 * instructions to conditional ADDs. So we do patch IF and ELSE
1271 * instructions in single program flow mode on those platforms.
1272 */
1273 if (brw->gen < 6)
1274 assert(!p->single_program_flow);
1275
1276 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1277 assert(endif_inst != NULL);
1278 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1279
1280 unsigned br = 1;
1281 /* Jump count is for 64bit data chunk each, so one 128bit instruction
1282 * requires 2 chunks.
1283 */
1284 if (brw->gen >= 5)
1285 br = 2;
1286
1287 assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1288 endif_inst->header.execution_size = if_inst->header.execution_size;
1289
1290 if (else_inst == NULL) {
1291 /* Patch IF -> ENDIF */
1292 if (brw->gen < 6) {
1293 /* Turn it into an IFF, which means no mask stack operations for
1294 * all-false and jumping past the ENDIF.
1295 */
1296 if_inst->header.opcode = BRW_OPCODE_IFF;
1297 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1298 if_inst->bits3.if_else.pop_count = 0;
1299 if_inst->bits3.if_else.pad0 = 0;
1300 } else if (brw->gen == 6) {
1301 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1302 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1303 } else {
1304 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1305 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1306 }
1307 } else {
1308 else_inst->header.execution_size = if_inst->header.execution_size;
1309
1310 /* Patch IF -> ELSE */
1311 if (brw->gen < 6) {
1312 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1313 if_inst->bits3.if_else.pop_count = 0;
1314 if_inst->bits3.if_else.pad0 = 0;
1315 } else if (brw->gen == 6) {
1316 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1317 }
1318
1319 /* Patch ELSE -> ENDIF */
1320 if (brw->gen < 6) {
1321 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1322 * matching ENDIF.
1323 */
1324 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1325 else_inst->bits3.if_else.pop_count = 1;
1326 else_inst->bits3.if_else.pad0 = 0;
1327 } else if (brw->gen == 6) {
1328 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1329 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1330 } else {
1331 /* The IF instruction's JIP should point just past the ELSE */
1332 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1333 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1334 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1335 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1336 }
1337 }
1338 }
1339
/* Emit an ELSE instruction.
 *
 * Jump targets are left zero here; they are filled in by patch_IF_ELSE()
 * when brw_ENDIF() is reached.  The ELSE is pushed onto the if-stack so
 * brw_ENDIF() can find it.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      /* Gen4-5: ELSE operates on IP with an embedded jump count. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: branch offset lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7+: ELSE carries JIP/UIP fields. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1372
/* Close the innermost IF (and optional ELSE) block.
 *
 * Pops the IF/ELSE from the if-stack, emits an ENDIF (unless we're in
 * pre-gen6 SPF mode, where the IF/ELSE become conditional ADDs instead),
 * and patches the jump targets of the whole construct.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      /* Gen4-5: ENDIF reads/writes g0 and pops the mask stack (below). */
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1451
/* Emit a BREAK instruction.
 *
 * Jump offsets are left zero: pre-gen6 BREAKs are patched later by
 * brw_patch_break_cont(); gen6+ BREAKs are handled by brw_set_uip_jip()
 * (see the comment above brw_patch_break_cont()).  Pre-gen6, the pop
 * count lets the EU unwind the if-stack entries of any IFs nested inside
 * the loop.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1474
1475 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1476 {
1477 struct brw_instruction *insn;
1478
1479 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1480 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1481 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1482 brw_set_dest(p, insn, brw_ip_reg());
1483 brw_set_src0(p, insn, brw_ip_reg());
1484 brw_set_src1(p, insn, brw_imm_d(0x0));
1485
1486 insn->header.compression_control = BRW_COMPRESSION_NONE;
1487 insn->header.execution_size = BRW_EXECUTE_8;
1488 return insn;
1489 }
1490
/* Emit a pre-gen6 CONTINUE instruction operating on IP.
 *
 * The jump count is left zero and patched by brw_patch_break_cont() when
 * the enclosing WHILE is emitted.  The pop count unwinds the if-stack
 * entries of any IFs nested inside the loop.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1505
/* Emit a HALT instruction (gen6+ encoding).
 *
 * The UIP/JIP offsets in src1 are zero here and updated later by the
 * caller.  NOTE(review): the compressed path leaves compression_control
 * at its current default rather than forcing it — presumably intentional
 * for SIMD16; confirm against callers.
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1523
/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; record the position of the loop's
       * first instruction so brw_WHILE() can jump back to it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1567
1568 /**
1569 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1570 * instruction here.
1571 *
1572 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1573 * nesting, since it can always just point to the end of the block/current loop.
1574 */
1575 static void
1576 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1577 {
1578 struct brw_context *brw = p->brw;
1579 struct brw_instruction *do_inst = get_inner_do_insn(p);
1580 struct brw_instruction *inst;
1581 int br = (brw->gen == 5) ? 2 : 1;
1582
1583 for (inst = while_inst - 1; inst != do_inst; inst--) {
1584 /* If the jump count is != 0, that means that this instruction has already
1585 * been patched because it's part of a loop inside of the one we're
1586 * patching.
1587 */
1588 if (inst->header.opcode == BRW_OPCODE_BREAK &&
1589 inst->bits3.if_else.jump_count == 0) {
1590 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1591 } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1592 inst->bits3.if_else.jump_count == 0) {
1593 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1594 }
1595 }
1596 }
1597
/* Close the innermost DO/WHILE loop.
 *
 * Emits the WHILE that jumps back to the loop's DO (or, pre-gen6 in
 * single-program-flow mode, an ADD to IP), patches pre-gen6 BREAK/CONT
 * instructions inside the loop, and pops the loop stack.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   /* Gen5+ jump counts are in 64-bit chunks: 2 per 128-bit instruction. */
   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      /* Gen7+: the backward branch lives in the JIP field. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      /* Gen6: the backward branch lives in the destination immediate. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: a plain ADD to IP jumps back (16 bytes/instruction). */
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Point any unpatched BREAK/CONT in the loop at this WHILE. */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   /* The loop is closed; pop it off the loop stack. */
   p->loop_stack_depth--;

   return insn;
}
1661
1662
1663 /* FORWARD JUMPS:
1664 */
1665 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1666 {
1667 struct brw_context *brw = p->brw;
1668 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1669 GLuint jmpi = 1;
1670
1671 if (brw->gen >= 5)
1672 jmpi = 2;
1673
1674 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1675 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1676
1677 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1678 }
1679
1680
1681
1682 /* To integrate with the above, it makes sense that the comparison
1683 * instruction should populate the flag register. It might be simpler
1684 * just to use the flag reg for most WM tasks?
1685 */
/* Emit a CMP instruction with the given conditional modifier, writing
 * the per-channel result to \p dest and updating the flag register.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1727
1728 /* Issue 'wait' instruction for n1, host could program MMIO
1729 to wake up thread. */
1730 void brw_WAIT (struct brw_compile *p)
1731 {
1732 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1733 struct brw_reg src = brw_notification_1_reg();
1734
1735 brw_set_dest(p, insn, src);
1736 brw_set_src0(p, insn, src);
1737 brw_set_src1(p, insn, brw_null_reg());
1738 insn->header.execution_size = 0; /* must */
1739 insn->header.predicate_control = 0;
1740 insn->header.compression_control = 0;
1741 }
1742
1743
1744 /***********************************************************************
1745 * Helpers for the various SEND message types:
1746 */
1747
/** Extended math function, float[8].
 *
 * On gen6+ this emits a native MATH instruction; on earlier gens the
 * request is routed through a SEND to the shared math unit using message
 * register \p msg_reg_nr (which is unused on gen6+).
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* The INT DIV functions take integer sources; all others float. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1812
/** Extended math function, float[8].
 *
 * Two-source variant of brw_math() for the gen6+ native MATH
 * instruction (e.g. POW, INT DIV) — no pre-gen6 SEND path here.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* The INT DIV functions take integer sources; all others float. */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1862
1863
1864 /**
1865 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1866 * using a constant offset per channel.
1867 *
1868 * The offset must be aligned to oword size (16 bytes). Used for
1869 * register spilling.
1870 */
1871 void brw_oword_block_write_scratch(struct brw_compile *p,
1872 struct brw_reg mrf,
1873 int num_regs,
1874 GLuint offset)
1875 {
1876 struct brw_context *brw = p->brw;
1877 uint32_t msg_control, msg_type;
1878 int mlen;
1879
1880 if (brw->gen >= 6)
1881 offset /= 16;
1882
1883 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1884
1885 if (num_regs == 1) {
1886 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1887 mlen = 2;
1888 } else {
1889 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1890 mlen = 3;
1891 }
1892
1893 /* Set up the message header. This is g0, with g0.2 filled with
1894 * the offset. We don't want to leave our offset around in g0 or
1895 * it'll screw up texture samples, so set it up inside the message
1896 * reg.
1897 */
1898 {
1899 brw_push_insn_state(p);
1900 brw_set_mask_control(p, BRW_MASK_DISABLE);
1901 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1902
1903 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1904
1905 /* set message header global offset field (reg 0, element 2) */
1906 brw_MOV(p,
1907 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1908 mrf.nr,
1909 2), BRW_REGISTER_TYPE_UD),
1910 brw_imm_ud(offset));
1911
1912 brw_pop_insn_state(p);
1913 }
1914
1915 {
1916 struct brw_reg dest;
1917 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1918 int send_commit_msg;
1919 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1920 BRW_REGISTER_TYPE_UW);
1921
1922 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1923 insn->header.compression_control = BRW_COMPRESSION_NONE;
1924 src_header = vec16(src_header);
1925 }
1926 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1927 insn->header.destreg__conditionalmod = mrf.nr;
1928
1929 /* Until gen6, writes followed by reads from the same location
1930 * are not guaranteed to be ordered unless write_commit is set.
1931 * If set, then a no-op write is issued to the destination
1932 * register to set a dependency, and a read from the destination
1933 * can be used to ensure the ordering.
1934 *
1935 * For gen6, only writes between different threads need ordering
1936 * protection. Our use of DP writes is all about register
1937 * spilling within a thread.
1938 */
1939 if (brw->gen >= 6) {
1940 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1941 send_commit_msg = 0;
1942 } else {
1943 dest = src_header;
1944 send_commit_msg = 1;
1945 }
1946
1947 brw_set_dest(p, insn, dest);
1948 if (brw->gen >= 6) {
1949 brw_set_src0(p, insn, mrf);
1950 } else {
1951 brw_set_src0(p, insn, brw_null_reg());
1952 }
1953
1954 if (brw->gen >= 6)
1955 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1956 else
1957 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1958
1959 brw_set_dp_write_message(p,
1960 insn,
1961 255, /* binding table index (255=stateless) */
1962 msg_control,
1963 msg_type,
1964 mlen,
1965 true, /* header_present */
1966 0, /* not a render target */
1967 send_commit_msg, /* response_length */
1968 0, /* eot */
1969 send_commit_msg);
1970 }
1971 }
1972
1973
1974 /**
1975 * Read a block of owords (half a GRF each) from the scratch buffer
1976 * using a constant index per channel.
1977 *
1978 * Offset must be aligned to oword size (16 bytes). Used for register
1979 * spilling.
1980 */
1981 void
1982 brw_oword_block_read_scratch(struct brw_compile *p,
1983 struct brw_reg dest,
1984 struct brw_reg mrf,
1985 int num_regs,
1986 GLuint offset)
1987 {
1988 struct brw_context *brw = p->brw;
1989 uint32_t msg_control;
1990 int rlen;
1991
1992 if (brw->gen >= 6)
1993 offset /= 16;
1994
1995 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1996 dest = retype(dest, BRW_REGISTER_TYPE_UW);
1997
1998 if (num_regs == 1) {
1999 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2000 rlen = 1;
2001 } else {
2002 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2003 rlen = 2;
2004 }
2005
2006 {
2007 brw_push_insn_state(p);
2008 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2009 brw_set_mask_control(p, BRW_MASK_DISABLE);
2010
2011 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2012
2013 /* set message header global offset field (reg 0, element 2) */
2014 brw_MOV(p,
2015 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2016 mrf.nr,
2017 2), BRW_REGISTER_TYPE_UD),
2018 brw_imm_ud(offset));
2019
2020 brw_pop_insn_state(p);
2021 }
2022
2023 {
2024 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2025
2026 assert(insn->header.predicate_control == 0);
2027 insn->header.compression_control = BRW_COMPRESSION_NONE;
2028 insn->header.destreg__conditionalmod = mrf.nr;
2029
2030 brw_set_dest(p, insn, dest); /* UW? */
2031 if (brw->gen >= 6) {
2032 brw_set_src0(p, insn, mrf);
2033 } else {
2034 brw_set_src0(p, insn, brw_null_reg());
2035 }
2036
2037 brw_set_dp_read_message(p,
2038 insn,
2039 255, /* binding table index (255=stateless) */
2040 msg_control,
2041 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2042 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2043 1, /* msg_length */
2044 true, /* header_present */
2045 rlen);
2046 }
2047 }
2048
2049 /**
2050 * Read a float[4] vector from the data port Data Cache (const buffer).
2051 * Location (in buffer) should be a multiple of 16.
2052 * Used for fetching shader constants.
2053 */
2054 void brw_oword_block_read(struct brw_compile *p,
2055 struct brw_reg dest,
2056 struct brw_reg mrf,
2057 uint32_t offset,
2058 uint32_t bind_table_index)
2059 {
2060 struct brw_context *brw = p->brw;
2061
2062 /* On newer hardware, offset is in units of owords. */
2063 if (brw->gen >= 6)
2064 offset /= 16;
2065
2066 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2067
2068 brw_push_insn_state(p);
2069 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2070 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2071 brw_set_mask_control(p, BRW_MASK_DISABLE);
2072
2073 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2074
2075 /* set message header global offset field (reg 0, element 2) */
2076 brw_MOV(p,
2077 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2078 mrf.nr,
2079 2), BRW_REGISTER_TYPE_UD),
2080 brw_imm_ud(offset));
2081
2082 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2083 insn->header.destreg__conditionalmod = mrf.nr;
2084
2085 /* cast dest to a uword[8] vector */
2086 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2087
2088 brw_set_dest(p, insn, dest);
2089 if (brw->gen >= 6) {
2090 brw_set_src0(p, insn, mrf);
2091 } else {
2092 brw_set_src0(p, insn, brw_null_reg());
2093 }
2094
2095 brw_set_dp_read_message(p,
2096 insn,
2097 bind_table_index,
2098 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2099 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2100 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2101 1, /* msg_length */
2102 true, /* header_present */
2103 1); /* response_length (1 reg, 2 owords!) */
2104
2105 brw_pop_insn_state(p);
2106 }
2107
2108
2109 void brw_fb_WRITE(struct brw_compile *p,
2110 int dispatch_width,
2111 GLuint msg_reg_nr,
2112 struct brw_reg src0,
2113 GLuint msg_control,
2114 GLuint binding_table_index,
2115 GLuint msg_length,
2116 GLuint response_length,
2117 bool eot,
2118 bool header_present)
2119 {
2120 struct brw_context *brw = p->brw;
2121 struct brw_instruction *insn;
2122 GLuint msg_type;
2123 struct brw_reg dest;
2124
2125 if (dispatch_width == 16)
2126 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2127 else
2128 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2129
2130 if (brw->gen >= 6) {
2131 insn = next_insn(p, BRW_OPCODE_SENDC);
2132 } else {
2133 insn = next_insn(p, BRW_OPCODE_SEND);
2134 }
2135 /* The execution mask is ignored for render target writes. */
2136 insn->header.predicate_control = 0;
2137 insn->header.compression_control = BRW_COMPRESSION_NONE;
2138
2139 if (brw->gen >= 6) {
2140 /* headerless version, just submit color payload */
2141 src0 = brw_message_reg(msg_reg_nr);
2142
2143 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2144 } else {
2145 insn->header.destreg__conditionalmod = msg_reg_nr;
2146
2147 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2148 }
2149
2150 brw_set_dest(p, insn, dest);
2151 brw_set_src0(p, insn, src0);
2152 brw_set_dp_write_message(p,
2153 insn,
2154 binding_table_index,
2155 msg_control,
2156 msg_type,
2157 msg_length,
2158 header_present,
2159 eot, /* last render target write */
2160 response_length,
2161 eot,
2162 0 /* send_commit_msg */);
2163 }
2164
2165
2166 /**
2167 * Texture sample instruction.
2168 * Note: the msg_type plus msg_length values determine exactly what kind
2169 * of sampling operation is performed. See volume 4, page 161 of docs.
2170 */
2171 void brw_SAMPLE(struct brw_compile *p,
2172 struct brw_reg dest,
2173 GLuint msg_reg_nr,
2174 struct brw_reg src0,
2175 GLuint binding_table_index,
2176 GLuint sampler,
2177 GLuint msg_type,
2178 GLuint response_length,
2179 GLuint msg_length,
2180 GLuint header_present,
2181 GLuint simd_mode,
2182 GLuint return_format)
2183 {
2184 struct brw_context *brw = p->brw;
2185 struct brw_instruction *insn;
2186
2187 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2188
2189 insn = next_insn(p, BRW_OPCODE_SEND);
2190 insn->header.predicate_control = 0; /* XXX */
2191 insn->header.compression_control = BRW_COMPRESSION_NONE;
2192 if (brw->gen < 6)
2193 insn->header.destreg__conditionalmod = msg_reg_nr;
2194
2195 brw_set_dest(p, insn, dest);
2196 brw_set_src0(p, insn, src0);
2197 brw_set_sampler_message(p, insn,
2198 binding_table_index,
2199 sampler,
2200 msg_type,
2201 response_length,
2202 msg_length,
2203 header_present,
2204 simd_mode,
2205 return_format);
2206 }
2207
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   GLuint msg_length,
		   GLuint response_length,
		   GLuint offset,
		   GLuint swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register number is encoded in the instruction. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2258
2259 static int
2260 next_ip(struct brw_compile *p, int ip)
2261 {
2262 struct brw_instruction *insn = (void *)p->store + ip;
2263
2264 if (insn->header.cmpt_control)
2265 return ip + 8;
2266 else
2267 return ip + 16;
2268 }
2269
2270 static int
2271 brw_find_next_block_end(struct brw_compile *p, int start)
2272 {
2273 int ip;
2274 void *store = p->store;
2275
2276 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2277 struct brw_instruction *insn = store + ip;
2278
2279 switch (insn->header.opcode) {
2280 case BRW_OPCODE_ENDIF:
2281 case BRW_OPCODE_ELSE:
2282 case BRW_OPCODE_WHILE:
2283 case BRW_OPCODE_HALT:
2284 return ip;
2285 }
2286 }
2287
2288 return 0;
2289 }
2290
2291 /* There is no DO instruction on gen6, so to find the end of the loop
2292 * we have to see if the loop is jumping back before our start
2293 * instruction.
2294 */
2295 static int
2296 brw_find_loop_end(struct brw_compile *p, int start)
2297 {
2298 struct brw_context *brw = p->brw;
2299 int ip;
2300 int scale = 8;
2301 void *store = p->store;
2302
2303 /* Always start after the instruction (such as a WHILE) we're trying to fix
2304 * up.
2305 */
2306 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2307 struct brw_instruction *insn = store + ip;
2308
2309 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2310 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2311 : insn->bits3.break_cont.jip;
2312 if (ip + jip * scale <= start)
2313 return ip;
2314 }
2315 }
2316 assert(!"not reached");
2317 return start;
2318 }
2319
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int ip;
   int scale = 8;   /* JIP/UIP fields are in units of 8 bytes */
   void *store = p->store;

   /* UIP/JIP fixup only applies to the gen6+ flow-control encoding. */
   if (brw->gen < 6)
      return;

   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_ip != 0);
         /* JIP: jump to the end of the enclosing control-flow block. */
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, ip) - ip +
             (brw->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         /* UIP: jump to the WHILE at the end of the loop. */
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
         /* With no block end in sight, jip of 2 (= 16 bytes with scale 8)
          * targets the immediately following instruction.
          */
         if (block_end_ip == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 break;

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
	 if (block_end_ip == 0) {
            insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2395
2396 void brw_ff_sync(struct brw_compile *p,
2397 struct brw_reg dest,
2398 GLuint msg_reg_nr,
2399 struct brw_reg src0,
2400 bool allocate,
2401 GLuint response_length,
2402 bool eot)
2403 {
2404 struct brw_context *brw = p->brw;
2405 struct brw_instruction *insn;
2406
2407 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2408
2409 insn = next_insn(p, BRW_OPCODE_SEND);
2410 brw_set_dest(p, insn, dest);
2411 brw_set_src0(p, insn, src0);
2412 brw_set_src1(p, insn, brw_imm_d(0));
2413
2414 if (brw->gen < 6)
2415 insn->header.destreg__conditionalmod = msg_reg_nr;
2416
2417 brw_set_ff_sync_message(p,
2418 insn,
2419 allocate,
2420 response_length,
2421 eot);
2422 }
2423
2424 /**
2425 * Emit the SEND instruction necessary to generate stream output data on Gen6
2426 * (for transform feedback).
2427 *
2428 * If send_commit_msg is true, this is the last piece of stream output data
2429 * from this thread, so send the data as a committed write. According to the
2430 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2431 *
2432 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2433 * writes are complete by sending the final write as a committed write."
2434 */
2435 void
2436 brw_svb_write(struct brw_compile *p,
2437 struct brw_reg dest,
2438 GLuint msg_reg_nr,
2439 struct brw_reg src0,
2440 GLuint binding_table_index,
2441 bool send_commit_msg)
2442 {
2443 struct brw_instruction *insn;
2444
2445 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2446
2447 insn = next_insn(p, BRW_OPCODE_SEND);
2448 brw_set_dest(p, insn, dest);
2449 brw_set_src0(p, insn, src0);
2450 brw_set_src1(p, insn, brw_imm_d(0));
2451 brw_set_dp_write_message(p, insn,
2452 binding_table_index,
2453 0, /* msg_control: ignored */
2454 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2455 1, /* msg_length */
2456 true, /* header_present */
2457 0, /* last_render_target: ignored */
2458 send_commit_msg, /* response_length */
2459 0, /* end_of_thread */
2460 send_commit_msg); /* send_commit_msg */
2461 }
2462
2463 /**
2464 * This instruction is generated as a single-channel align1 instruction by
2465 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2466 *
2467 * We can't use the typed atomic op in the FS because that has the execution
2468 * mask ANDed with the pixel mask, but we just want to write the one dword for
2469 * all the pixels.
2470 *
2471 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2472 * one u32. So we use the same untyped atomic write message as the pixel
2473 * shader.
2474 *
2475 * The untyped atomic operation requires a BUFFER surface type with RAW
2476 * format, and is only accessible through the legacy DATA_CACHE dataport
2477 * messages.
2478 */
2479 void brw_shader_time_add(struct brw_compile *p,
2480 struct brw_reg payload,
2481 uint32_t surf_index)
2482 {
2483 struct brw_context *brw = p->brw;
2484 assert(brw->gen >= 7);
2485
2486 brw_push_insn_state(p);
2487 brw_set_access_mode(p, BRW_ALIGN_1);
2488 brw_set_mask_control(p, BRW_MASK_DISABLE);
2489 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2490 brw_pop_insn_state(p);
2491
2492 /* We use brw_vec1_reg and unmasked because we want to increment the given
2493 * offset only once.
2494 */
2495 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2496 BRW_ARF_NULL, 0));
2497 brw_set_src0(p, send, brw_vec1_reg(payload.file,
2498 payload.nr, 0));
2499
2500 uint32_t sfid, msg_type;
2501 if (brw->is_haswell) {
2502 sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2503 msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2504 } else {
2505 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
2506 msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2507 }
2508
2509 bool header_present = false;
2510 bool eot = false;
2511 uint32_t mlen = 2; /* offset, value */
2512 uint32_t rlen = 0;
2513 brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);
2514
2515 send->bits3.ud |= msg_type << 14;
2516 send->bits3.ud |= 0 << 13; /* no return data */
2517 send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2518 send->bits3.ud |= BRW_AOP_ADD << 8;
2519 send->bits3.ud |= surf_index << 0;
2520 }