i965: Remove never used RSR and RSL opcodes.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 GLuint msg_reg_nr)
65 {
66 struct brw_context *brw = p->brw;
67 if (brw->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102
/**
 * Encode \p dest as the destination operand of \p insn.
 *
 * The destination encoding differs along two axes: addressing mode
 * (direct vs. register-indirect) and access mode (Align1 vs. Align16),
 * so exactly one of the da1/da16/ia1/ia16 union layouts is filled in.
 * As a side effect, the instruction's execution size is derived from
 * the destination width (see guess_execution_size()).
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* Register numbers must fit in 7 bits; ARF and MRF destinations are
    * exempt from this check.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On gen7, MRFs are emulated with the top of the GRF space. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A stride-0 destination is quietly promoted to stride 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16 subregister numbers are in units of 16 bytes. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same stride-0 -> stride-1 promotion as the direct case above. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
159
160 extern int reg_type_size[];
161
162 static void
163 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
164 {
165 int hstride_for_reg[] = {0, 1, 2, 4};
166 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
167 int width_for_reg[] = {1, 2, 4, 8, 16};
168 int execsize_for_reg[] = {1, 2, 4, 8, 16};
169 int width, hstride, vstride, execsize;
170
171 if (reg.file == BRW_IMMEDIATE_VALUE) {
172 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
173 * mean the destination has to be 128-bit aligned and the
174 * destination horiz stride has to be a word.
175 */
176 if (reg.type == BRW_REGISTER_TYPE_V) {
177 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
178 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
179 }
180
181 return;
182 }
183
184 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
185 reg.file == BRW_ARF_NULL)
186 return;
187
188 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
189 hstride = hstride_for_reg[reg.hstride];
190
191 if (reg.vstride == 0xf) {
192 vstride = -1;
193 } else {
194 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
195 vstride = vstride_for_reg[reg.vstride];
196 }
197
198 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
199 width = width_for_reg[reg.width];
200
201 assert(insn->header.execution_size >= 0 &&
202 insn->header.execution_size < Elements(execsize_for_reg));
203 execsize = execsize_for_reg[insn->header.execution_size];
204
205 /* Restrictions from 3.3.10: Register Region Restrictions. */
206 /* 3. */
207 assert(execsize >= width);
208
209 /* 4. */
210 if (execsize == width && hstride != 0) {
211 assert(vstride == -1 || vstride == width * hstride);
212 }
213
214 /* 5. */
215 if (execsize == width && hstride == 0) {
216 /* no restriction on vstride. */
217 }
218
219 /* 6. */
220 if (width == 1) {
221 assert(hstride == 0);
222 }
223
224 /* 7. */
225 if (execsize == 1 && width == 1) {
226 assert(hstride == 0);
227 assert(vstride == 0);
228 }
229
230 /* 8. */
231 if (vstride == 0 && hstride == 0) {
232 assert(width == 1);
233 }
234
235 /* 10. Check destination issues. */
236 }
237
238 void
239 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
240 struct brw_reg reg)
241 {
242 struct brw_context *brw = p->brw;
243
244 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
245 assert(reg.nr < 128);
246
247 gen7_convert_mrf_to_grf(p, &reg);
248
249 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
250 insn->header.opcode == BRW_OPCODE_SENDC)) {
251 /* Any source modifiers or regions will be ignored, since this just
252 * identifies the MRF/GRF to start reading the message contents from.
253 * Check for some likely failures.
254 */
255 assert(!reg.negate);
256 assert(!reg.abs);
257 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
258 }
259
260 validate_reg(insn, reg);
261
262 insn->bits1.da1.src0_reg_file = reg.file;
263 insn->bits1.da1.src0_reg_type = reg.type;
264 insn->bits2.da1.src0_abs = reg.abs;
265 insn->bits2.da1.src0_negate = reg.negate;
266 insn->bits2.da1.src0_address_mode = reg.address_mode;
267
268 if (reg.file == BRW_IMMEDIATE_VALUE) {
269 insn->bits3.ud = reg.dw1.ud;
270
271 /* Required to set some fields in src1 as well:
272 */
273 insn->bits1.da1.src1_reg_file = 0; /* arf */
274 insn->bits1.da1.src1_reg_type = reg.type;
275 }
276 else
277 {
278 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
279 if (insn->header.access_mode == BRW_ALIGN_1) {
280 insn->bits2.da1.src0_subreg_nr = reg.subnr;
281 insn->bits2.da1.src0_reg_nr = reg.nr;
282 }
283 else {
284 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
285 insn->bits2.da16.src0_reg_nr = reg.nr;
286 }
287 }
288 else {
289 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
290
291 if (insn->header.access_mode == BRW_ALIGN_1) {
292 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
293 }
294 else {
295 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
296 }
297 }
298
299 if (insn->header.access_mode == BRW_ALIGN_1) {
300 if (reg.width == BRW_WIDTH_1 &&
301 insn->header.execution_size == BRW_EXECUTE_1) {
302 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
303 insn->bits2.da1.src0_width = BRW_WIDTH_1;
304 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
305 }
306 else {
307 insn->bits2.da1.src0_horiz_stride = reg.hstride;
308 insn->bits2.da1.src0_width = reg.width;
309 insn->bits2.da1.src0_vert_stride = reg.vstride;
310 }
311 }
312 else {
313 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
314 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
315 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
316 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
317
318 /* This is an oddity of the fact we're using the same
319 * descriptions for registers in align_16 as align_1:
320 */
321 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
322 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
323 else
324 insn->bits2.da16.src0_vert_stride = reg.vstride;
325 }
326 }
327 }
328
329
330 void brw_set_src1(struct brw_compile *p,
331 struct brw_instruction *insn,
332 struct brw_reg reg)
333 {
334 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
335
336 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
337 assert(reg.nr < 128);
338
339 gen7_convert_mrf_to_grf(p, &reg);
340
341 validate_reg(insn, reg);
342
343 insn->bits1.da1.src1_reg_file = reg.file;
344 insn->bits1.da1.src1_reg_type = reg.type;
345 insn->bits3.da1.src1_abs = reg.abs;
346 insn->bits3.da1.src1_negate = reg.negate;
347
348 /* Only src1 can be immediate in two-argument instructions.
349 */
350 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
351
352 if (reg.file == BRW_IMMEDIATE_VALUE) {
353 insn->bits3.ud = reg.dw1.ud;
354 }
355 else {
356 /* This is a hardware restriction, which may or may not be lifted
357 * in the future:
358 */
359 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
360 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
361
362 if (insn->header.access_mode == BRW_ALIGN_1) {
363 insn->bits3.da1.src1_subreg_nr = reg.subnr;
364 insn->bits3.da1.src1_reg_nr = reg.nr;
365 }
366 else {
367 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
368 insn->bits3.da16.src1_reg_nr = reg.nr;
369 }
370
371 if (insn->header.access_mode == BRW_ALIGN_1) {
372 if (reg.width == BRW_WIDTH_1 &&
373 insn->header.execution_size == BRW_EXECUTE_1) {
374 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
375 insn->bits3.da1.src1_width = BRW_WIDTH_1;
376 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
377 }
378 else {
379 insn->bits3.da1.src1_horiz_stride = reg.hstride;
380 insn->bits3.da1.src1_width = reg.width;
381 insn->bits3.da1.src1_vert_stride = reg.vstride;
382 }
383 }
384 else {
385 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
386 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
387 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
388 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
389
390 /* This is an oddity of the fact we're using the same
391 * descriptions for registers in align_16 as align_1:
392 */
393 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
394 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
395 else
396 insn->bits3.da16.src1_vert_stride = reg.vstride;
397 }
398 }
399 }
400
401 /**
402 * Set the Message Descriptor and Extended Message Descriptor fields
403 * for SEND messages.
404 *
405 * \note This zeroes out the Function Control bits, so it must be called
406 * \b before filling out any message-specific data. Callers can
407 * choose not to fill in irrelevant bits; they will be zero.
408 */
409 static void
410 brw_set_message_descriptor(struct brw_compile *p,
411 struct brw_instruction *inst,
412 enum brw_message_target sfid,
413 unsigned msg_length,
414 unsigned response_length,
415 bool header_present,
416 bool end_of_thread)
417 {
418 struct brw_context *brw = p->brw;
419
420 brw_set_src1(p, inst, brw_imm_d(0));
421
422 if (brw->gen >= 5) {
423 inst->bits3.generic_gen5.header_present = header_present;
424 inst->bits3.generic_gen5.response_length = response_length;
425 inst->bits3.generic_gen5.msg_length = msg_length;
426 inst->bits3.generic_gen5.end_of_thread = end_of_thread;
427
428 if (brw->gen >= 6) {
429 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
430 inst->header.destreg__conditionalmod = sfid;
431 } else {
432 /* Set Extended Message Descriptor (ex_desc) */
433 inst->bits2.send_gen5.sfid = sfid;
434 inst->bits2.send_gen5.end_of_thread = end_of_thread;
435 }
436 } else {
437 inst->bits3.generic.response_length = response_length;
438 inst->bits3.generic.msg_length = msg_length;
439 inst->bits3.generic.msg_target = sfid;
440 inst->bits3.generic.end_of_thread = end_of_thread;
441 }
442 }
443
444 static void brw_set_math_message( struct brw_compile *p,
445 struct brw_instruction *insn,
446 GLuint function,
447 GLuint integer_type,
448 bool low_precision,
449 GLuint dataType )
450 {
451 struct brw_context *brw = p->brw;
452 unsigned msg_length;
453 unsigned response_length;
454
455 /* Infer message length from the function */
456 switch (function) {
457 case BRW_MATH_FUNCTION_POW:
458 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
459 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
460 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
461 msg_length = 2;
462 break;
463 default:
464 msg_length = 1;
465 break;
466 }
467
468 /* Infer response length from the function */
469 switch (function) {
470 case BRW_MATH_FUNCTION_SINCOS:
471 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
472 response_length = 2;
473 break;
474 default:
475 response_length = 1;
476 break;
477 }
478
479
480 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
481 msg_length, response_length, false, false);
482 if (brw->gen == 5) {
483 insn->bits3.math_gen5.function = function;
484 insn->bits3.math_gen5.int_type = integer_type;
485 insn->bits3.math_gen5.precision = low_precision;
486 insn->bits3.math_gen5.saturate = insn->header.saturate;
487 insn->bits3.math_gen5.data_type = dataType;
488 insn->bits3.math_gen5.snapshot = 0;
489 } else {
490 insn->bits3.math.function = function;
491 insn->bits3.math.int_type = integer_type;
492 insn->bits3.math.precision = low_precision;
493 insn->bits3.math.saturate = insn->header.saturate;
494 insn->bits3.math.data_type = dataType;
495 }
496 insn->header.saturate = 0;
497 }
498
499
/* Fill out the descriptor for an FF_SYNC URB message.  FF_SYNC always
 * sends a single register with a header; most urb_gen5 descriptor
 * fields do not apply and are zeroed.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    GLuint response_length,
				    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
515
/* Fill out the descriptor for a URB write message.  \p flags selects
 * EOT/allocate/used/complete/per-slot-offset behavior; the descriptor
 * layout differs per generation (urb_gen7, urb_gen5 and pre-gen5 urb
 * union members).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 enum brw_urb_write_flags flags,
				 GLuint msg_length,
				 GLuint response_length,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
			      flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Transposed writes are not supported by the gen7 encoding below. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
	 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      /* Note the inverted sense: the UNUSED flag clears the "used" bit. */
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
553
/**
 * Fill out the descriptor for a data port write message.
 *
 * Chooses the shared function (SFID) by generation: gen7 routes render
 * target writes to the render cache and everything else to the data
 * cache, gen6 sends all writes to the render cache, and pre-gen6 uses
 * the dedicated dataport-write SFID.  The remaining fields are packed
 * into the generation-specific descriptor layout.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
611
/**
 * Fill out the descriptor for a data port read message.
 *
 * Chooses the SFID by generation: gen7 always uses the data cache,
 * gen6 selects render vs. sampler cache based on \p target_cache, and
 * pre-gen6 uses the dedicated dataport-read SFID.  Descriptor field
 * packing differs per generation (and g4x has its own layout).
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			bool header_present,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
668
/**
 * Fill out the descriptor for a sampler message.
 *
 * The descriptor layout varies by generation (gen7, gen5/6, g4x and
 * original gen4 each use a different bits3 union member); note that
 * \p return_format is only encoded on original gen4, and \p simd_mode
 * only from gen5 onwards.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
707
708
709 #define next_insn brw_next_insn
710 struct brw_instruction *
711 brw_next_insn(struct brw_compile *p, GLuint opcode)
712 {
713 struct brw_instruction *insn;
714
715 if (p->nr_insn + 1 > p->store_size) {
716 if (0)
717 printf("incresing the store size to %d\n", p->store_size << 1);
718 p->store_size <<= 1;
719 p->store = reralloc(p->mem_ctx, p->store,
720 struct brw_instruction, p->store_size);
721 if (!p->store)
722 assert(!"realloc eu store memeory failed");
723 }
724
725 p->next_insn_offset += 16;
726 insn = &p->store[p->nr_insn++];
727 memcpy(insn, p->current, sizeof(*insn));
728
729 /* Reset this one-shot flag:
730 */
731
732 if (p->current->header.destreg__conditionalmod) {
733 p->current->header.destreg__conditionalmod = 0;
734 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
735 }
736
737 insn->header.opcode = opcode;
738 return insn;
739 }
740
741 static struct brw_instruction *brw_alu1( struct brw_compile *p,
742 GLuint opcode,
743 struct brw_reg dest,
744 struct brw_reg src )
745 {
746 struct brw_instruction *insn = next_insn(p, opcode);
747 brw_set_dest(p, insn, dest);
748 brw_set_src0(p, insn, src);
749 return insn;
750 }
751
752 static struct brw_instruction *brw_alu2(struct brw_compile *p,
753 GLuint opcode,
754 struct brw_reg dest,
755 struct brw_reg src0,
756 struct brw_reg src1 )
757 {
758 struct brw_instruction *insn = next_insn(p, opcode);
759 brw_set_dest(p, insn, dest);
760 brw_set_src0(p, insn, src0);
761 brw_set_src1(p, insn, src1);
762 return insn;
763 }
764
765 static int
766 get_3src_subreg_nr(struct brw_reg reg)
767 {
768 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
769 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
770 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
771 } else {
772 return reg.subnr / 4;
773 }
774 }
775
/* Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...).
 * 3-source instructions use a dedicated da3src encoding that only
 * supports Align16 mode, direct addressing, GRF sources, and a
 * GRF/MRF destination of type F, D or UD.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					GLuint opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   /* dest_reg_file is a single bit: 0 = GRF, 1 = MRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar source across all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   /* src1's subregister number straddles the bits2/bits3 boundary. */
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
859
860
861 /***********************************************************************
862 * Convenience routines.
863 */
864 #define ALU1(OP) \
865 struct brw_instruction *brw_##OP(struct brw_compile *p, \
866 struct brw_reg dest, \
867 struct brw_reg src0) \
868 { \
869 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
870 }
871
872 #define ALU2(OP) \
873 struct brw_instruction *brw_##OP(struct brw_compile *p, \
874 struct brw_reg dest, \
875 struct brw_reg src0, \
876 struct brw_reg src1) \
877 { \
878 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
879 }
880
881 #define ALU3(OP) \
882 struct brw_instruction *brw_##OP(struct brw_compile *p, \
883 struct brw_reg dest, \
884 struct brw_reg src0, \
885 struct brw_reg src1, \
886 struct brw_reg src2) \
887 { \
888 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
889 }
890
891 #define ALU3F(OP) \
892 struct brw_instruction *brw_##OP(struct brw_compile *p, \
893 struct brw_reg dest, \
894 struct brw_reg src0, \
895 struct brw_reg src1, \
896 struct brw_reg src2) \
897 { \
898 assert(dest.type == BRW_REGISTER_TYPE_F); \
899 assert(src0.type == BRW_REGISTER_TYPE_F); \
900 assert(src1.type == BRW_REGISTER_TYPE_F); \
901 assert(src2.type == BRW_REGISTER_TYPE_F); \
902 return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
903 }
904
905 /* Rounding operations (other than RNDD) require two instructions - the first
906 * stores a rounded value (possibly the wrong way) in the dest register, but
907 * also sets a per-channel "increment bit" in the flag register. A predicated
908 * add of 1.0 fixes dest to contain the desired result.
909 *
910 * Sandybridge and later appear to round correctly without an ADD.
911 */
912 #define ROUND(OP) \
913 void brw_##OP(struct brw_compile *p, \
914 struct brw_reg dest, \
915 struct brw_reg src) \
916 { \
917 struct brw_instruction *rnd, *add; \
918 rnd = next_insn(p, BRW_OPCODE_##OP); \
919 brw_set_dest(p, rnd, dest); \
920 brw_set_src0(p, rnd, src); \
921 \
922 if (p->brw->gen < 6) { \
923 /* turn on round-increments */ \
924 rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \
925 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
926 add->header.predicate_control = BRW_PREDICATE_NORMAL; \
927 } \
928 }
929
930
931 ALU1(MOV)
932 ALU2(SEL)
933 ALU1(NOT)
934 ALU2(AND)
935 ALU2(OR)
936 ALU2(XOR)
937 ALU2(SHR)
938 ALU2(SHL)
939 ALU2(ASR)
940 ALU1(F32TO16)
941 ALU1(F16TO32)
942 ALU1(FRC)
943 ALU1(RNDD)
944 ALU2(MAC)
945 ALU2(MACH)
946 ALU1(LZD)
947 ALU2(DP4)
948 ALU2(DPH)
949 ALU2(DP3)
950 ALU2(DP2)
951 ALU2(LINE)
952 ALU2(PLN)
953 ALU3F(MAD)
954 ALU3F(LRP)
955 ALU1(BFREV)
956 ALU3(BFE)
957 ALU2(BFI1)
958 ALU3(BFI2)
959 ALU1(FBH)
960 ALU1(FBL)
961 ALU1(CBIT)
962
963 ROUND(RNDZ)
964 ROUND(RNDE)
965
966
967 struct brw_instruction *brw_ADD(struct brw_compile *p,
968 struct brw_reg dest,
969 struct brw_reg src0,
970 struct brw_reg src1)
971 {
972 /* 6.2.2: add */
973 if (src0.type == BRW_REGISTER_TYPE_F ||
974 (src0.file == BRW_IMMEDIATE_VALUE &&
975 src0.type == BRW_REGISTER_TYPE_VF)) {
976 assert(src1.type != BRW_REGISTER_TYPE_UD);
977 assert(src1.type != BRW_REGISTER_TYPE_D);
978 }
979
980 if (src1.type == BRW_REGISTER_TYPE_F ||
981 (src1.file == BRW_IMMEDIATE_VALUE &&
982 src1.type == BRW_REGISTER_TYPE_VF)) {
983 assert(src0.type != BRW_REGISTER_TYPE_UD);
984 assert(src0.type != BRW_REGISTER_TYPE_D);
985 }
986
987 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
988 }
989
990 struct brw_instruction *brw_AVG(struct brw_compile *p,
991 struct brw_reg dest,
992 struct brw_reg src0,
993 struct brw_reg src1)
994 {
995 assert(dest.type == src0.type);
996 assert(src0.type == src1.type);
997 switch (src0.type) {
998 case BRW_REGISTER_TYPE_B:
999 case BRW_REGISTER_TYPE_UB:
1000 case BRW_REGISTER_TYPE_W:
1001 case BRW_REGISTER_TYPE_UW:
1002 case BRW_REGISTER_TYPE_D:
1003 case BRW_REGISTER_TYPE_UD:
1004 break;
1005 default:
1006 assert(!"Bad type for brw_AVG");
1007 }
1008
1009 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1010 }
1011
1012 struct brw_instruction *brw_MUL(struct brw_compile *p,
1013 struct brw_reg dest,
1014 struct brw_reg src0,
1015 struct brw_reg src1)
1016 {
1017 /* 6.32.38: mul */
1018 if (src0.type == BRW_REGISTER_TYPE_D ||
1019 src0.type == BRW_REGISTER_TYPE_UD ||
1020 src1.type == BRW_REGISTER_TYPE_D ||
1021 src1.type == BRW_REGISTER_TYPE_UD) {
1022 assert(dest.type != BRW_REGISTER_TYPE_F);
1023 }
1024
1025 if (src0.type == BRW_REGISTER_TYPE_F ||
1026 (src0.file == BRW_IMMEDIATE_VALUE &&
1027 src0.type == BRW_REGISTER_TYPE_VF)) {
1028 assert(src1.type != BRW_REGISTER_TYPE_UD);
1029 assert(src1.type != BRW_REGISTER_TYPE_D);
1030 }
1031
1032 if (src1.type == BRW_REGISTER_TYPE_F ||
1033 (src1.file == BRW_IMMEDIATE_VALUE &&
1034 src1.type == BRW_REGISTER_TYPE_VF)) {
1035 assert(src0.type != BRW_REGISTER_TYPE_UD);
1036 assert(src0.type != BRW_REGISTER_TYPE_D);
1037 }
1038
1039 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1040 src0.nr != BRW_ARF_ACCUMULATOR);
1041 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1042 src1.nr != BRW_ARF_ACCUMULATOR);
1043
1044 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1045 }
1046
1047
1048 void brw_NOP(struct brw_compile *p)
1049 {
1050 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1051 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1052 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1053 brw_set_src1(p, insn, brw_imm_ud(0x0));
1054 }
1055
1056
1057
1058
1059
1060 /***********************************************************************
1061 * Comparisons, if/else/endif
1062 */
1063
/* Emit a JMPI (jump-indexed) instruction.  The jump target offset is patched
 * later (see brw_land_fwd_jump).  JMPI is inherently scalar, so execution is
 * forced to one channel with masking disabled.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* Any pending predicate applied to the jump itself; don't let it leak
    * into subsequently emitted instructions.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1079
/* Record an IF/ELSE instruction on the if-stack so the matching brw_ENDIF
 * can find and patch it later.  Store offsets rather than pointers because
 * p->store may be reallocated as instructions are emitted.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow after the write: the invariant is array_size > if_stack_depth,
    * which guarantees room for the next push.
    */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1092
1093 static struct brw_instruction *
1094 pop_if_stack(struct brw_compile *p)
1095 {
1096 p->if_stack_depth--;
1097 return &p->store[p->if_stack[p->if_stack_depth]];
1098 }
1099
1100 static void
1101 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1102 {
1103 if (p->loop_stack_array_size < p->loop_stack_depth) {
1104 p->loop_stack_array_size *= 2;
1105 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1106 p->loop_stack_array_size);
1107 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1108 p->loop_stack_array_size);
1109 }
1110
1111 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1112 p->loop_stack_depth++;
1113 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1114 }
1115
1116 static struct brw_instruction *
1117 get_inner_do_insn(struct brw_compile *p)
1118 {
1119 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1120 }
1121
1122 /* EU takes the value from the flag register and pushes it onto some
1123 * sort of a stack (presumably merging with any flag value already on
1124 * the stack). Within an if block, the flags at the top of the stack
1125 * control execution on each channel of the unit, eg. on each of the
1126 * 16 pixel values in our wm programs.
1127 *
1128 * When the matching 'else' instruction is reached (presumably by
1129 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
1131 *
1132 * When the matching 'endif' instruction is reached, the flags are
1133 * popped off. If the stack is now empty, normal execution resumes.
1134 */
/* Emit an IF instruction with the generation-appropriate operand encoding.
 * The branch targets are left zero here and patched by patch_IF_ELSE() when
 * the matching ENDIF is emitted.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Pre-gen6: IF operates on IP with a jump count in src1. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: the branch offset lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+: branch offsets are the JIP/UIP fields. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   /* Flow control forces a thread switch unless we're in SPF mode. */
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* The predicate was consumed by the IF itself; later instructions should
    * not inherit it.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1175
1176 /* This function is only used for gen6-style IF instructions with an
1177 * embedded comparison (conditional modifier). It is not used on gen7.
1178 */
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 *
 * The comparison (src0 <conditional> src1) is folded into the IF itself, so
 * no predicate is used; the jump count is patched later by patch_IF_ELSE().
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* On an IF, the destreg/condmod field carries the embedded comparison. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1207
1208 /**
1209 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1210 */
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 *
 * Since execution is uniform in SPF mode, the IF/ELSE become plain jumps
 * implemented as predicated ADDs to IP.  The distances are in bytes:
 * 16 bytes per 128-bit instruction.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1247
1248 /**
1249 * Patch IF and ELSE instructions with appropriate jump targets.
1250 */
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * Called from brw_ENDIF once the extent of the conditional is known.  All
 * distances are in "jump units" (br): one 64-bit chunk before gen5, two
 * chunks (one full instruction) from gen5 on.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 /* Gen7+: with no ELSE, both the taken and not-taken paths land on
	  * the ENDIF.
	  */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1337
/* Emit an ELSE instruction.  Like brw_IF, the branch targets are left zero
 * and patched by patch_IF_ELSE() when the matching ENDIF is emitted; the
 * ELSE is pushed on the if-stack so brw_ENDIF can find it.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      /* Pre-gen6: ELSE operates on IP with a jump count in src1. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7+: branch offsets are the JIP/UIP fields. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1370
/* Close the innermost IF (and optional ELSE): either emit a real ENDIF and
 * patch the IF/ELSE jump targets, or (pre-gen6 SPF mode) rewrite the IF/ELSE
 * into ADDs on IP and emit nothing.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1449
/* Emit a BREAK.  The jump target is patched later: by brw_patch_break_cont()
 * (pre-gen6) or by the caller's brw_set_uip_jip pass (gen6+).  Pre-gen6 also
 * records how many IF levels must be popped to leave the loop.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      /* Pop the mask stack entries of any IFs open inside this loop. */
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1472
1473 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1474 {
1475 struct brw_instruction *insn;
1476
1477 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1478 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1479 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1480 brw_set_dest(p, insn, brw_ip_reg());
1481 brw_set_src0(p, insn, brw_ip_reg());
1482 brw_set_src1(p, insn, brw_imm_d(0x0));
1483
1484 insn->header.compression_control = BRW_COMPRESSION_NONE;
1485 insn->header.execution_size = BRW_EXECUTE_8;
1486 return insn;
1487 }
1488
/* Emit a pre-gen6 CONTINUE.  The jump count is patched later by
 * brw_patch_break_cont(); pop_count records how many IF levels must be
 * popped to get back to the top of the loop.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1503
/* Emit a gen6+ HALT.  The UIP/JIP targets (packed in src1) are zero here and
 * updated later by the caller once the landing point is known.
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1521
1522 /* DO/WHILE loop:
1523 *
1524 * The DO/WHILE is just an unterminated loop -- break or continue are
1525 * used for control within the loop. We have a few ways they can be
1526 * done.
1527 *
1528 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1529 * jip and no DO instruction.
1530 *
1531 * For non-uniform control flow pre-gen6, there's a DO instruction to
1532 * push the mask, and a WHILE to jump back, and BREAK to get out and
1533 * pop the mask.
1534 *
1535 * For gen6, there's no more mask stack, so no need for DO. WHILE
1536 * just points back to the first instruction of the loop.
1537 */
/* Open a DO/WHILE loop.  On gen6+ (or in SPF mode) no DO instruction exists;
 * the position of the next instruction is simply recorded on the loop stack
 * so the WHILE can branch back to it.  Pre-gen6 emits a real DO that pushes
 * the execution mask.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1565
1566 /**
1567 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1568 * instruction here.
1569 *
1570 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1571 * nesting, since it can always just point to the end of the block/current loop.
1572 */
/* Walk the body of the innermost loop (between its DO and the given WHILE)
 * and fill in the jump counts of any not-yet-patched BREAK and CONTINUE
 * instructions.  BREAK jumps just past the WHILE; CONTINUE jumps to the
 * WHILE itself.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Gen5 counts jumps in 64-bit chunks (2 per instruction). */
   int br = (brw->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1595
/* Close the innermost loop with a WHILE (or, pre-gen6 SPF mode, a plain ADD
 * to IP).  The backward branch distance to the loop start is computed here;
 * pre-gen6 this also patches the loop body's BREAK/CONT instructions.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   /* Gen5+ counts jumps in 64-bit chunks: 2 per 128-bit instruction. */
   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Gen7+: backward distance goes in the JIP field. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Gen6: backward distance goes in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: an unconditional ADD to IP replaces the WHILE. */
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   /* Don't let a pending predicate leak into subsequent instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1659
1660
1661 /* FORWARD JUMPS:
1662 */
/* Patch a previously emitted JMPI (at store index jmp_insn_idx) so that it
 * jumps to the current end of the program.  The distance is measured in
 * jump units: 64-bit chunks pre-gen5, whole instructions (2 chunks) after.
 */
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   GLuint jmpi = 1;

   if (brw->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   /* -1: the distance is relative to the instruction after the JMPI. */
   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
}
1677
1678
1679
1680 /* To integrate with the above, it makes sense that the comparison
1681 * instruction should populate the flag register. It might be simpler
1682 * just to use the flag reg for most WM tasks?
1683 */
/* Emit a CMP instruction (src0 <conditional> src1), writing the per-channel
 * result into the flag register and, as a side effect of a null destination,
 * enabling predication for subsequently emitted instructions.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
	  dest.nr == BRW_ARF_NULL) {
	 insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1725
1726 /* Issue 'wait' instruction for n1, host could program MMIO
1727 to wake up thread. */
/* Emit a WAIT on notification register n1.  The thread sleeps until the host
 * (or another agent) signals it, e.g. via MMIO.  WAIT requires scalar,
 * unpredicated, uncompressed execution.
 */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   /* WAIT uses the notification register as both dest and src0. */
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1740
1741
1742 /***********************************************************************
1743 * Helpers for the various SEND message types:
1744 */
1745
1746 /** Extended math function, float[8].
1747 */
/** Extended math function, float[8].
 *
 * On gen6+ MATH is a native instruction; the function selector is packed
 * into the CondModifier/ThreadCtrl bits.  On earlier hardware, math is a
 * SEND message to the shared math box, with msg_reg_nr selecting the message
 * register and data_type/precision describing the payload.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer division takes integer sources; everything else is float. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1810
1811 /** Extended math function, float[8].
1812 */
/** Extended math function, float[8].
 *
 * Two-source variant of brw_math() for the gen6+ native MATH instruction
 * (e.g. POW, INT_DIV); not available on the pre-gen6 SEND-based path.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* Integer division takes integer sources; everything else is float. */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1860
1861
/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   GLuint offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control, msg_type;
   int mlen;

   /* On Gen6+, the offset field of the message header is in units of
    * owords rather than bytes.
    */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* mlen counts the header MRF plus the data MRFs (one GRF holds two
    * owords).
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* The SEND itself must be uncompressed; widen the header source
       * to SIMD16 to match if we inherited compressed state.
       */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      /* For SEND, this field carries the message register number. */
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (brw->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (brw->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1970
1971
/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     GLuint offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control;
   int rlen;

   /* On Gen6+, the offset field of the message header is in units of
    * owords rather than bytes.
    */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* rlen is the number of response GRFs (one GRF holds two owords). */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in the MRF: copy g0, then overwrite the
    * global offset field (g0.2 equivalent) with our scratch offset.
    */
   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      /* For SEND, this field carries the message register number. */
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest);	/* UW? */
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
			      insn,
			      255, /* binding table index (255=stateless) */
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
			      true, /* header_present */
			      rlen);
   }
}
2046
/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_compile *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   struct brw_context *brw = p->brw;

   /* On newer hardware, offset is in units of owords. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Header setup must be unpredicated, uncompressed and unmasked so
    * the MOVs below execute regardless of channel enables.
    */
   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   /* For SEND, this field carries the message register number. */
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (brw->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
			   true, /* header_present */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2105
2106
2107 void brw_fb_WRITE(struct brw_compile *p,
2108 int dispatch_width,
2109 GLuint msg_reg_nr,
2110 struct brw_reg src0,
2111 GLuint msg_control,
2112 GLuint binding_table_index,
2113 GLuint msg_length,
2114 GLuint response_length,
2115 bool eot,
2116 bool header_present)
2117 {
2118 struct brw_context *brw = p->brw;
2119 struct brw_instruction *insn;
2120 GLuint msg_type;
2121 struct brw_reg dest;
2122
2123 if (dispatch_width == 16)
2124 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2125 else
2126 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2127
2128 if (brw->gen >= 6) {
2129 insn = next_insn(p, BRW_OPCODE_SENDC);
2130 } else {
2131 insn = next_insn(p, BRW_OPCODE_SEND);
2132 }
2133 /* The execution mask is ignored for render target writes. */
2134 insn->header.predicate_control = 0;
2135 insn->header.compression_control = BRW_COMPRESSION_NONE;
2136
2137 if (brw->gen >= 6) {
2138 /* headerless version, just submit color payload */
2139 src0 = brw_message_reg(msg_reg_nr);
2140
2141 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2142 } else {
2143 insn->header.destreg__conditionalmod = msg_reg_nr;
2144
2145 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2146 }
2147
2148 brw_set_dest(p, insn, dest);
2149 brw_set_src0(p, insn, src0);
2150 brw_set_dp_write_message(p,
2151 insn,
2152 binding_table_index,
2153 msg_control,
2154 msg_type,
2155 msg_length,
2156 header_present,
2157 eot, /* last render target write */
2158 response_length,
2159 eot,
2160 0 /* send_commit_msg */);
2161 }
2162
2163
/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_compile *p,
		struct brw_reg dest,
		GLuint msg_reg_nr,
		struct brw_reg src0,
		GLuint binding_table_index,
		GLuint sampler,
		GLuint msg_type,
		GLuint response_length,
		GLuint msg_length,
		GLuint header_present,
		GLuint simd_mode,
		GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   /* On Gen6+, copy src0 into the message register if needed. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.predicate_control = 0; /* XXX */
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   /* Before Gen6, the message register number lives in the SEND header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
			   binding_table_index,
			   sampler,
			   msg_type,
			   response_length,
			   msg_length,
			   header_present,
			   simd_mode,
			   return_format);
}
2205
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   enum brw_urb_write_flags flags,
		   GLuint msg_length,
		   GLuint response_length,
		   GLuint offset,
		   GLuint swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   /* On Gen6+, copy src0 into the message register if needed. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 from g0.5 into byte 5 of the message header before
       * emitting the SEND itself.
       */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
	        brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Before Gen6, the message register number lives in the SEND header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2256
2257 static int
2258 next_ip(struct brw_compile *p, int ip)
2259 {
2260 struct brw_instruction *insn = (void *)p->store + ip;
2261
2262 if (insn->header.cmpt_control)
2263 return ip + 8;
2264 else
2265 return ip + 16;
2266 }
2267
2268 static int
2269 brw_find_next_block_end(struct brw_compile *p, int start)
2270 {
2271 int ip;
2272 void *store = p->store;
2273
2274 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2275 struct brw_instruction *insn = store + ip;
2276
2277 switch (insn->header.opcode) {
2278 case BRW_OPCODE_ENDIF:
2279 case BRW_OPCODE_ELSE:
2280 case BRW_OPCODE_WHILE:
2281 case BRW_OPCODE_HALT:
2282 return ip;
2283 }
2284 }
2285
2286 return 0;
2287 }
2288
2289 /* There is no DO instruction on gen6, so to find the end of the loop
2290 * we have to see if the loop is jumping back before our start
2291 * instruction.
2292 */
2293 static int
2294 brw_find_loop_end(struct brw_compile *p, int start)
2295 {
2296 struct brw_context *brw = p->brw;
2297 int ip;
2298 int scale = 8;
2299 void *store = p->store;
2300
2301 /* Always start after the instruction (such as a WHILE) we're trying to fix
2302 * up.
2303 */
2304 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2305 struct brw_instruction *insn = store + ip;
2306
2307 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2308 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2309 : insn->bits3.break_cont.jip;
2310 if (ip + jip * scale <= start)
2311 return ip;
2312 }
2313 }
2314 assert(!"not reached");
2315 return start;
2316 }
2317
2318 /* After program generation, go back and update the UIP and JIP of
2319 * BREAK, CONT, and HALT instructions to their correct locations.
2320 */
2321 void
2322 brw_set_uip_jip(struct brw_compile *p)
2323 {
2324 struct brw_context *brw = p->brw;
2325 int ip;
2326 int scale = 8;
2327 void *store = p->store;
2328
2329 if (brw->gen < 6)
2330 return;
2331
2332 for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2333 struct brw_instruction *insn = store + ip;
2334
2335 if (insn->header.cmpt_control) {
2336 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2337 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2338 insn->header.opcode != BRW_OPCODE_CONTINUE &&
2339 insn->header.opcode != BRW_OPCODE_HALT);
2340 continue;
2341 }
2342
2343 int block_end_ip = brw_find_next_block_end(p, ip);
2344 switch (insn->header.opcode) {
2345 case BRW_OPCODE_BREAK:
2346 assert(block_end_ip != 0);
2347 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2348 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2349 insn->bits3.break_cont.uip =
2350 (brw_find_loop_end(p, ip) - ip +
2351 (brw->gen == 6 ? 16 : 0)) / scale;
2352 break;
2353 case BRW_OPCODE_CONTINUE:
2354 assert(block_end_ip != 0);
2355 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2356 insn->bits3.break_cont.uip =
2357 (brw_find_loop_end(p, ip) - ip) / scale;
2358
2359 assert(insn->bits3.break_cont.uip != 0);
2360 assert(insn->bits3.break_cont.jip != 0);
2361 break;
2362
2363 case BRW_OPCODE_ENDIF:
2364 if (block_end_ip == 0)
2365 insn->bits3.break_cont.jip = 2;
2366 else
2367 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2368 break;
2369
2370 case BRW_OPCODE_HALT:
2371 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2372 *
2373 * "In case of the halt instruction not inside any conditional
2374 * code block, the value of <JIP> and <UIP> should be the
2375 * same. In case of the halt instruction inside conditional code
2376 * block, the <UIP> should be the end of the program, and the
2377 * <JIP> should be end of the most inner conditional code block."
2378 *
2379 * The uip will have already been set by whoever set up the
2380 * instruction.
2381 */
2382 if (block_end_ip == 0) {
2383 insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2384 } else {
2385 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2386 }
2387 assert(insn->bits3.break_cont.uip != 0);
2388 assert(insn->bits3.break_cont.jip != 0);
2389 break;
2390 }
2391 }
2392 }
2393
/* Emit a SEND carrying an FF_SYNC message (fixed-function URB
 * synchronization).  The message payload comes from \p src0 via
 * \p msg_reg_nr.
 */
void brw_ff_sync(struct brw_compile *p,
		 struct brw_reg dest,
		 GLuint msg_reg_nr,
		 struct brw_reg src0,
		 bool allocate,
		 GLuint response_length,
		 bool eot)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   /* On Gen6+, copy src0 into the message register if needed. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Before Gen6, the message register number lives in the SEND header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2421
/**
 * Emit the SEND instruction necessary to generate stream output data on Gen6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write.  According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *    writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_compile *p,
	      struct brw_reg dest,
	      GLuint msg_reg_nr,
	      struct brw_reg src0,
	      GLuint binding_table_index,
	      bool send_commit_msg)
{
   struct brw_instruction *insn;

   /* On Gen6+, copy src0 into the message register if needed. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
			    binding_table_index,
			    0, /* msg_control: ignored */
			    GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
			    1, /* msg_length */
			    true, /* header_present */
			    0, /* last_render_target: ignored */
			    send_commit_msg, /* response_length */
			    0, /* end_of_thread */
			    send_commit_msg); /* send_commit_msg */
}
2460
/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
 * one u32.  So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_compile *p,
			 struct brw_reg payload,
			 uint32_t surf_index)
{
   struct brw_context *brw = p->brw;
   /* The data cache atomic message used below only exists on Gen7+. */
   assert(brw->gen >= 7);

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
				      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
				      payload.nr, 0));

   /* Haswell moved the untyped atomics to data cache port 1 with a new
    * message type encoding.
    */
   uint32_t sfid, msg_type;
   if (brw->is_haswell) {
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
   } else {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
   }

   bool header_present = false;
   bool eot = false;
   uint32_t mlen = 2; /* offset, value */
   uint32_t rlen = 0;
   brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);

   /* Pack the remaining message descriptor bits by hand. */
   send->bits3.ud |= msg_type << 14;
   send->bits3.ud |= 0 << 13; /* no return data */
   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
   send->bits3.ud |= BRW_AOP_ADD << 8;
   send->bits3.ud |= surf_index << 0;
}