i965/gs: Add a case to brwNewProgram() for geometry shaders.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  GLuint msg_reg_nr)
{
   struct brw_context *brw = p->brw;
   /* Pre-Sandybridge hardware performs the move to the MRF implicitly;
    * nothing to do.
    */
   if (brw->gen < 6)
      return;

   /* The source is already a message register; SEND can use it directly. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Emit an uncompressed, force-writemask-all MOV of the source into the
       * requested message register.  Insn state is saved/restored so the
       * mask/compression overrides don't leak into subsequent instructions.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Redirect the caller's source operand to the message register. */
   *src = brw_message_reg(msg_reg_nr);
}
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102
/* Encode the destination operand of @insn from @dest.  Handles both direct
 * and register-indirect addressing, and both align1 and align16 access modes,
 * whose destination fields have different layouts.  Also sets the
 * instruction's execution size from the destination width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* GRF/IMM register numbers must fit in the 128-register file; ARF and MRF
    * encodings use other number spaces, so they are exempt.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On Gen7 the MRF is emulated with the top of the GRF. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A horizontal stride of 0 is not encodable for destinations;
	  * promote it to stride 1.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same stride-0 promotion as the direct align1 case above. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
159
160 extern int reg_type_size[];
161
162 static void
163 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
164 {
165 int hstride_for_reg[] = {0, 1, 2, 4};
166 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
167 int width_for_reg[] = {1, 2, 4, 8, 16};
168 int execsize_for_reg[] = {1, 2, 4, 8, 16};
169 int width, hstride, vstride, execsize;
170
171 if (reg.file == BRW_IMMEDIATE_VALUE) {
172 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
173 * mean the destination has to be 128-bit aligned and the
174 * destination horiz stride has to be a word.
175 */
176 if (reg.type == BRW_REGISTER_TYPE_V) {
177 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
178 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
179 }
180
181 return;
182 }
183
184 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
185 reg.file == BRW_ARF_NULL)
186 return;
187
188 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
189 hstride = hstride_for_reg[reg.hstride];
190
191 if (reg.vstride == 0xf) {
192 vstride = -1;
193 } else {
194 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
195 vstride = vstride_for_reg[reg.vstride];
196 }
197
198 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
199 width = width_for_reg[reg.width];
200
201 assert(insn->header.execution_size >= 0 &&
202 insn->header.execution_size < Elements(execsize_for_reg));
203 execsize = execsize_for_reg[insn->header.execution_size];
204
205 /* Restrictions from 3.3.10: Register Region Restrictions. */
206 /* 3. */
207 assert(execsize >= width);
208
209 /* 4. */
210 if (execsize == width && hstride != 0) {
211 assert(vstride == -1 || vstride == width * hstride);
212 }
213
214 /* 5. */
215 if (execsize == width && hstride == 0) {
216 /* no restriction on vstride. */
217 }
218
219 /* 6. */
220 if (width == 1) {
221 assert(hstride == 0);
222 }
223
224 /* 7. */
225 if (execsize == 1 && width == 1) {
226 assert(hstride == 0);
227 assert(vstride == 0);
228 }
229
230 /* 8. */
231 if (vstride == 0 && hstride == 0) {
232 assert(width == 1);
233 }
234
235 /* 10. Check destination issues. */
236 }
237
238 void
239 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
240 struct brw_reg reg)
241 {
242 struct brw_context *brw = p->brw;
243
244 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
245 assert(reg.nr < 128);
246
247 gen7_convert_mrf_to_grf(p, &reg);
248
249 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
250 insn->header.opcode == BRW_OPCODE_SENDC)) {
251 /* Any source modifiers or regions will be ignored, since this just
252 * identifies the MRF/GRF to start reading the message contents from.
253 * Check for some likely failures.
254 */
255 assert(!reg.negate);
256 assert(!reg.abs);
257 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
258 }
259
260 validate_reg(insn, reg);
261
262 insn->bits1.da1.src0_reg_file = reg.file;
263 insn->bits1.da1.src0_reg_type = reg.type;
264 insn->bits2.da1.src0_abs = reg.abs;
265 insn->bits2.da1.src0_negate = reg.negate;
266 insn->bits2.da1.src0_address_mode = reg.address_mode;
267
268 if (reg.file == BRW_IMMEDIATE_VALUE) {
269 insn->bits3.ud = reg.dw1.ud;
270
271 /* Required to set some fields in src1 as well:
272 */
273 insn->bits1.da1.src1_reg_file = 0; /* arf */
274 insn->bits1.da1.src1_reg_type = reg.type;
275 }
276 else
277 {
278 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
279 if (insn->header.access_mode == BRW_ALIGN_1) {
280 insn->bits2.da1.src0_subreg_nr = reg.subnr;
281 insn->bits2.da1.src0_reg_nr = reg.nr;
282 }
283 else {
284 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
285 insn->bits2.da16.src0_reg_nr = reg.nr;
286 }
287 }
288 else {
289 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
290
291 if (insn->header.access_mode == BRW_ALIGN_1) {
292 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
293 }
294 else {
295 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
296 }
297 }
298
299 if (insn->header.access_mode == BRW_ALIGN_1) {
300 if (reg.width == BRW_WIDTH_1 &&
301 insn->header.execution_size == BRW_EXECUTE_1) {
302 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
303 insn->bits2.da1.src0_width = BRW_WIDTH_1;
304 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
305 }
306 else {
307 insn->bits2.da1.src0_horiz_stride = reg.hstride;
308 insn->bits2.da1.src0_width = reg.width;
309 insn->bits2.da1.src0_vert_stride = reg.vstride;
310 }
311 }
312 else {
313 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
314 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
315 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
316 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
317
318 /* This is an oddity of the fact we're using the same
319 * descriptions for registers in align_16 as align_1:
320 */
321 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
322 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
323 else
324 insn->bits2.da16.src0_vert_stride = reg.vstride;
325 }
326 }
327 }
328
329
330 void brw_set_src1(struct brw_compile *p,
331 struct brw_instruction *insn,
332 struct brw_reg reg)
333 {
334 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
335
336 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
337 assert(reg.nr < 128);
338
339 gen7_convert_mrf_to_grf(p, &reg);
340
341 validate_reg(insn, reg);
342
343 insn->bits1.da1.src1_reg_file = reg.file;
344 insn->bits1.da1.src1_reg_type = reg.type;
345 insn->bits3.da1.src1_abs = reg.abs;
346 insn->bits3.da1.src1_negate = reg.negate;
347
348 /* Only src1 can be immediate in two-argument instructions.
349 */
350 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
351
352 if (reg.file == BRW_IMMEDIATE_VALUE) {
353 insn->bits3.ud = reg.dw1.ud;
354 }
355 else {
356 /* This is a hardware restriction, which may or may not be lifted
357 * in the future:
358 */
359 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
360 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
361
362 if (insn->header.access_mode == BRW_ALIGN_1) {
363 insn->bits3.da1.src1_subreg_nr = reg.subnr;
364 insn->bits3.da1.src1_reg_nr = reg.nr;
365 }
366 else {
367 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
368 insn->bits3.da16.src1_reg_nr = reg.nr;
369 }
370
371 if (insn->header.access_mode == BRW_ALIGN_1) {
372 if (reg.width == BRW_WIDTH_1 &&
373 insn->header.execution_size == BRW_EXECUTE_1) {
374 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
375 insn->bits3.da1.src1_width = BRW_WIDTH_1;
376 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
377 }
378 else {
379 insn->bits3.da1.src1_horiz_stride = reg.hstride;
380 insn->bits3.da1.src1_width = reg.width;
381 insn->bits3.da1.src1_vert_stride = reg.vstride;
382 }
383 }
384 else {
385 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
386 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
387 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
388 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
389
390 /* This is an oddity of the fact we're using the same
391 * descriptions for registers in align_16 as align_1:
392 */
393 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
394 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
395 else
396 insn->bits3.da16.src1_vert_stride = reg.vstride;
397 }
398 }
399 }
400
401 /**
402 * Set the Message Descriptor and Extended Message Descriptor fields
403 * for SEND messages.
404 *
405 * \note This zeroes out the Function Control bits, so it must be called
406 * \b before filling out any message-specific data. Callers can
407 * choose not to fill in irrelevant bits; they will be zero.
408 */
409 static void
410 brw_set_message_descriptor(struct brw_compile *p,
411 struct brw_instruction *inst,
412 enum brw_message_target sfid,
413 unsigned msg_length,
414 unsigned response_length,
415 bool header_present,
416 bool end_of_thread)
417 {
418 struct brw_context *brw = p->brw;
419
420 brw_set_src1(p, inst, brw_imm_d(0));
421
422 if (brw->gen >= 5) {
423 inst->bits3.generic_gen5.header_present = header_present;
424 inst->bits3.generic_gen5.response_length = response_length;
425 inst->bits3.generic_gen5.msg_length = msg_length;
426 inst->bits3.generic_gen5.end_of_thread = end_of_thread;
427
428 if (brw->gen >= 6) {
429 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
430 inst->header.destreg__conditionalmod = sfid;
431 } else {
432 /* Set Extended Message Descriptor (ex_desc) */
433 inst->bits2.send_gen5.sfid = sfid;
434 inst->bits2.send_gen5.end_of_thread = end_of_thread;
435 }
436 } else {
437 inst->bits3.generic.response_length = response_length;
438 inst->bits3.generic.msg_length = msg_length;
439 inst->bits3.generic.msg_target = sfid;
440 inst->bits3.generic.end_of_thread = end_of_thread;
441 }
442 }
443
444 static void brw_set_math_message( struct brw_compile *p,
445 struct brw_instruction *insn,
446 GLuint function,
447 GLuint integer_type,
448 bool low_precision,
449 GLuint dataType )
450 {
451 struct brw_context *brw = p->brw;
452 unsigned msg_length;
453 unsigned response_length;
454
455 /* Infer message length from the function */
456 switch (function) {
457 case BRW_MATH_FUNCTION_POW:
458 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
459 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
460 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
461 msg_length = 2;
462 break;
463 default:
464 msg_length = 1;
465 break;
466 }
467
468 /* Infer response length from the function */
469 switch (function) {
470 case BRW_MATH_FUNCTION_SINCOS:
471 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
472 response_length = 2;
473 break;
474 default:
475 response_length = 1;
476 break;
477 }
478
479
480 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
481 msg_length, response_length, false, false);
482 if (brw->gen == 5) {
483 insn->bits3.math_gen5.function = function;
484 insn->bits3.math_gen5.int_type = integer_type;
485 insn->bits3.math_gen5.precision = low_precision;
486 insn->bits3.math_gen5.saturate = insn->header.saturate;
487 insn->bits3.math_gen5.data_type = dataType;
488 insn->bits3.math_gen5.snapshot = 0;
489 } else {
490 insn->bits3.math.function = function;
491 insn->bits3.math.int_type = integer_type;
492 insn->bits3.math.precision = low_precision;
493 insn->bits3.math.saturate = insn->header.saturate;
494 insn->bits3.math.data_type = dataType;
495 }
496 insn->header.saturate = 0;
497 }
498
499
/* Fill in the descriptor for a Gen5 URB FF_SYNC message (msg_length 1,
 * header present).  Fields unused by FF_SYNC are cleared explicitly.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    GLuint response_length,
				    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
515
/* Fill in the descriptor for a URB write message, using the per-generation
 * descriptor layout (gen7 / gen5-6 / pre-gen5).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 bool allocate,
				 bool used,
				 GLuint msg_length,
				 GLuint response_length,
				 bool end_of_thread,
				 bool complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true, end_of_thread);
   if (brw->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes have no transpose mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
   }
}
555
/* Fill in the descriptor for a data-port write message, selecting the
 * shared-function ID (cache) appropriate for the hardware generation.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* Per-generation function-control field layouts. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
613
/* Fill in the descriptor for a data-port read message, selecting the
 * shared-function ID (cache) appropriate for the hardware generation.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			bool header_present,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* On Gen6 the SFID encodes the target cache directly. */
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   /* Per-generation function-control field layouts. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
   }
}
670
/* Fill in the descriptor for a sampler message, using the per-generation
 * function-control layout (gen7 / gen5-6 / G4x / original).
 */
void
brw_set_sampler_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint sampler,
			GLuint msg_type,
			GLuint response_length,
			GLuint msg_length,
			GLuint header_present,
			GLuint simd_mode,
			GLuint return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      /* return_format only exists in the original (pre-G4x) layout. */
      insn->bits3.sampler.return_format = return_format;
   }
}
709
710
711 #define next_insn brw_next_insn
712 struct brw_instruction *
713 brw_next_insn(struct brw_compile *p, GLuint opcode)
714 {
715 struct brw_instruction *insn;
716
717 if (p->nr_insn + 1 > p->store_size) {
718 if (0)
719 printf("incresing the store size to %d\n", p->store_size << 1);
720 p->store_size <<= 1;
721 p->store = reralloc(p->mem_ctx, p->store,
722 struct brw_instruction, p->store_size);
723 if (!p->store)
724 assert(!"realloc eu store memeory failed");
725 }
726
727 p->next_insn_offset += 16;
728 insn = &p->store[p->nr_insn++];
729 memcpy(insn, p->current, sizeof(*insn));
730
731 /* Reset this one-shot flag:
732 */
733
734 if (p->current->header.destreg__conditionalmod) {
735 p->current->header.destreg__conditionalmod = 0;
736 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
737 }
738
739 insn->header.opcode = opcode;
740 return insn;
741 }
742
743 static struct brw_instruction *brw_alu1( struct brw_compile *p,
744 GLuint opcode,
745 struct brw_reg dest,
746 struct brw_reg src )
747 {
748 struct brw_instruction *insn = next_insn(p, opcode);
749 brw_set_dest(p, insn, dest);
750 brw_set_src0(p, insn, src);
751 return insn;
752 }
753
754 static struct brw_instruction *brw_alu2(struct brw_compile *p,
755 GLuint opcode,
756 struct brw_reg dest,
757 struct brw_reg src0,
758 struct brw_reg src1 )
759 {
760 struct brw_instruction *insn = next_insn(p, opcode);
761 brw_set_dest(p, insn, dest);
762 brw_set_src0(p, insn, src0);
763 brw_set_src1(p, insn, src1);
764 return insn;
765 }
766
767 static int
768 get_3src_subreg_nr(struct brw_reg reg)
769 {
770 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
771 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
772 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
773 } else {
774 return reg.subnr / 4;
775 }
776 }
777
/* Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2).  3-src
 * instructions use a distinct, more restrictive encoding: align16 only,
 * GRF sources, and a shared source data type.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					GLuint opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   /* On Gen7 the MRF is emulated with the top of the GRF. */
   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   /* Destination: GRF or MRF only, direct addressing, F/D/UD type. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
	  dest.type == BRW_REGISTER_TYPE_D ||
	  dest.type == BRW_REGISTER_TYPE_UD);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   /* Source 0: GRF, direct addressing. */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar (vstride 0) source across channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   /* Source 1: same restrictions; its subreg number is split across two
    * instruction words.
    */
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   /* Source 2: same restrictions as source 0. */
   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
	 break;
      case BRW_REGISTER_TYPE_D:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
	 break;
      case BRW_REGISTER_TYPE_UD:
	 insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
	 insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
	 break;
      }
   }

   return insn;
}
861
862
/***********************************************************************
 * Convenience routines.
 */
/* Define brw_<OP> emitting a one-source ALU instruction. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Define brw_<OP> emitting a two-source ALU instruction. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Define brw_<OP> emitting a three-source ALU instruction. */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Like ALU3, but asserting that all four operands are float-typed. */
#define ALU3F(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
				 struct brw_reg dest,	\
				 struct brw_reg src0,	\
				 struct brw_reg src1,	\
				 struct brw_reg src2)	\
{							\
   assert(dest.type == BRW_REGISTER_TYPE_F);		\
   assert(src0.type == BRW_REGISTER_TYPE_F);		\
   assert(src1.type == BRW_REGISTER_TYPE_F);		\
   assert(src2.type == BRW_REGISTER_TYPE_F);		\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}


/* Instantiate the public emitters for each opcode. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)

ROUND(RNDZ)
ROUND(RNDE)
969
970
971 struct brw_instruction *brw_ADD(struct brw_compile *p,
972 struct brw_reg dest,
973 struct brw_reg src0,
974 struct brw_reg src1)
975 {
976 /* 6.2.2: add */
977 if (src0.type == BRW_REGISTER_TYPE_F ||
978 (src0.file == BRW_IMMEDIATE_VALUE &&
979 src0.type == BRW_REGISTER_TYPE_VF)) {
980 assert(src1.type != BRW_REGISTER_TYPE_UD);
981 assert(src1.type != BRW_REGISTER_TYPE_D);
982 }
983
984 if (src1.type == BRW_REGISTER_TYPE_F ||
985 (src1.file == BRW_IMMEDIATE_VALUE &&
986 src1.type == BRW_REGISTER_TYPE_VF)) {
987 assert(src0.type != BRW_REGISTER_TYPE_UD);
988 assert(src0.type != BRW_REGISTER_TYPE_D);
989 }
990
991 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
992 }
993
994 struct brw_instruction *brw_AVG(struct brw_compile *p,
995 struct brw_reg dest,
996 struct brw_reg src0,
997 struct brw_reg src1)
998 {
999 assert(dest.type == src0.type);
1000 assert(src0.type == src1.type);
1001 switch (src0.type) {
1002 case BRW_REGISTER_TYPE_B:
1003 case BRW_REGISTER_TYPE_UB:
1004 case BRW_REGISTER_TYPE_W:
1005 case BRW_REGISTER_TYPE_UW:
1006 case BRW_REGISTER_TYPE_D:
1007 case BRW_REGISTER_TYPE_UD:
1008 break;
1009 default:
1010 assert(!"Bad type for brw_AVG");
1011 }
1012
1013 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1014 }
1015
1016 struct brw_instruction *brw_MUL(struct brw_compile *p,
1017 struct brw_reg dest,
1018 struct brw_reg src0,
1019 struct brw_reg src1)
1020 {
1021 /* 6.32.38: mul */
1022 if (src0.type == BRW_REGISTER_TYPE_D ||
1023 src0.type == BRW_REGISTER_TYPE_UD ||
1024 src1.type == BRW_REGISTER_TYPE_D ||
1025 src1.type == BRW_REGISTER_TYPE_UD) {
1026 assert(dest.type != BRW_REGISTER_TYPE_F);
1027 }
1028
1029 if (src0.type == BRW_REGISTER_TYPE_F ||
1030 (src0.file == BRW_IMMEDIATE_VALUE &&
1031 src0.type == BRW_REGISTER_TYPE_VF)) {
1032 assert(src1.type != BRW_REGISTER_TYPE_UD);
1033 assert(src1.type != BRW_REGISTER_TYPE_D);
1034 }
1035
1036 if (src1.type == BRW_REGISTER_TYPE_F ||
1037 (src1.file == BRW_IMMEDIATE_VALUE &&
1038 src1.type == BRW_REGISTER_TYPE_VF)) {
1039 assert(src0.type != BRW_REGISTER_TYPE_UD);
1040 assert(src0.type != BRW_REGISTER_TYPE_D);
1041 }
1042
1043 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1044 src0.nr != BRW_ARF_ACCUMULATOR);
1045 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1046 src1.nr != BRW_ARF_ACCUMULATOR);
1047
1048 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1049 }
1050
1051
1052 void brw_NOP(struct brw_compile *p)
1053 {
1054 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1055 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1056 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1057 brw_set_src1(p, insn, brw_imm_ud(0x0));
1058 }
1059
1060
1061
1062
1063
1064 /***********************************************************************
1065 * Comparisons, if/else/endif
1066 */
1067
/**
 * Emit a JMPI (jump indexed) instruction.
 *
 * JMPI is a scalar operation: execution size 1, no compression, and the
 * execution mask disabled.  After emission the default predication state
 * is reset to NONE so following instructions are not accidentally
 * predicated.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1083
1084 static void
1085 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1086 {
1087 p->if_stack[p->if_stack_depth] = inst - p->store;
1088
1089 p->if_stack_depth++;
1090 if (p->if_stack_array_size <= p->if_stack_depth) {
1091 p->if_stack_array_size *= 2;
1092 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1093 p->if_stack_array_size);
1094 }
1095 }
1096
1097 static struct brw_instruction *
1098 pop_if_stack(struct brw_compile *p)
1099 {
1100 p->if_stack_depth--;
1101 return &p->store[p->if_stack[p->if_stack_depth]];
1102 }
1103
1104 static void
1105 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1106 {
1107 if (p->loop_stack_array_size < p->loop_stack_depth) {
1108 p->loop_stack_array_size *= 2;
1109 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1110 p->loop_stack_array_size);
1111 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1112 p->loop_stack_array_size);
1113 }
1114
1115 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1116 p->loop_stack_depth++;
1117 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1118 }
1119
1120 static struct brw_instruction *
1121 get_inner_do_insn(struct brw_compile *p)
1122 {
1123 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1124 }
1125
1126 /* EU takes the value from the flag register and pushes it onto some
1127 * sort of a stack (presumably merging with any flag value already on
1128 * the stack). Within an if block, the flags at the top of the stack
1129 * control execution on each channel of the unit, eg. on each of the
1130 * 16 pixel values in our wm programs.
1131 *
1132 * When the matching 'else' instruction is reached (presumably by
1133 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
1135 *
1136 * When the matching 'endif' instruction is reached, the flags are
1137 * popped off. If the stack is now empty, normal execution resumes.
1138 */
/**
 * Emit an IF instruction.  The jump-target fields are left zero here and
 * are patched later by patch_IF_ELSE(), which brw_ENDIF() invokes once
 * the location of the matching ENDIF is known.
 *
 * The operand encoding differs by generation: pre-gen6 operates on the
 * IP register, gen6 stores a jump count in an immediate-word dest, and
 * gen7+ uses the JIP/UIP fields in bits3.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let the predication leak into subsequent instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1179
1180 /* This function is only used for gen6-style IF instructions with an
1181 * embedded comparison (conditional modifier). It is not used on gen7.
1182 */
1183 struct brw_instruction *
1184 gen6_IF(struct brw_compile *p, uint32_t conditional,
1185 struct brw_reg src0, struct brw_reg src1)
1186 {
1187 struct brw_instruction *insn;
1188
1189 insn = next_insn(p, BRW_OPCODE_IF);
1190
1191 brw_set_dest(p, insn, brw_imm_w(0));
1192 if (p->compressed) {
1193 insn->header.execution_size = BRW_EXECUTE_16;
1194 } else {
1195 insn->header.execution_size = BRW_EXECUTE_8;
1196 }
1197 insn->bits1.branch_gen6.jump_count = 0;
1198 brw_set_src0(p, insn, src0);
1199 brw_set_src1(p, insn, src1);
1200
1201 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1202 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1203 insn->header.destreg__conditionalmod = conditional;
1204
1205 if (!p->single_program_flow)
1206 insn->header.thread_control = BRW_THREAD_SWITCH;
1207
1208 push_if_stack(p, insn);
1209 return insn;
1210 }
1211
1212 /**
1213 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1214 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* The * 16 scales an instruction-count delta to a byte offset for
       * the IP-relative ADD (each instruction is 128 bits = 16 bytes).
       */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1251
1252 /**
1253 * Patch IF and ELSE instructions with appropriate jump targets.
1254 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1341
/**
 * Emit an ELSE instruction and push it on the if-stack so brw_ENDIF()
 * can find it and patch its jump fields (the targets are left zero here;
 * see patch_IF_ELSE).  Per-generation operand encodings mirror brw_IF().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1374
/**
 * Close an IF/ELSE block: pop the matching IF (and optional ELSE) off
 * the if-stack, emit the ENDIF (unless it is elided in pre-gen6 SPF
 * mode), and patch all the jump targets via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1453
/**
 * Emit a BREAK out of the innermost loop.
 *
 * Pre-gen6, the BREAK must pop the mask stack once for each IF level
 * currently open inside the loop (pop_count); its jump count is left
 * zero and patched later by brw_patch_break_cont().  On gen6+ the jump
 * targets are filled in after the fact (see the comment above brw_DO
 * referring to brw_set_uip_jip()).
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1476
1477 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1478 {
1479 struct brw_instruction *insn;
1480
1481 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1482 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1483 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1484 brw_set_dest(p, insn, brw_ip_reg());
1485 brw_set_src0(p, insn, brw_ip_reg());
1486 brw_set_src1(p, insn, brw_imm_d(0x0));
1487
1488 insn->header.compression_control = BRW_COMPRESSION_NONE;
1489 insn->header.execution_size = BRW_EXECUTE_8;
1490 return insn;
1491 }
1492
/**
 * Emit a pre-gen6 CONTINUE.  pop_count pops the mask stack for each IF
 * level open inside the loop; the jump count is left zero and patched by
 * brw_patch_break_cont() when the loop's WHILE is emitted.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1507
/**
 * Emit a gen6+ HALT.  src1 carries the UIP and JIP fields, which are
 * updated later by the caller.
 *
 * NOTE(review): the compressed path sets only execution_size and leaves
 * compression_control at the current default — confirm that is intended.
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1525
1526 /* DO/WHILE loop:
1527 *
1528 * The DO/WHILE is just an unterminated loop -- break or continue are
1529 * used for control within the loop. We have a few ways they can be
1530 * done.
1531 *
1532 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1533 * jip and no DO instruction.
1534 *
1535 * For non-uniform control flow pre-gen6, there's a DO instruction to
1536 * push the mask, and a WHILE to jump back, and BREAK to get out and
1537 * pop the mask.
1538 *
1539 * For gen6, there's no more mask stack, so no need for DO. WHILE
1540 * just points back to the first instruction of the loop.
1541 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; just remember where the loop body
       * starts so brw_WHILE() can jump back to it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1569
1570 /**
1571 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1572 * instruction here.
1573 *
1574 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1575 * nesting, since it can always just point to the end of the block/current loop.
1576 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Jump counts are in 64-bit chunks; gen5's 128-bit instructions need 2
    * chunks per instruction (see the matching comment in patch_IF_ELSE).
    */
   int br = (brw->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to the DO, fixing up every BREAK and
    * CONTINUE emitted inside this loop.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1599
/**
 * Emit the WHILE that closes the innermost loop and pop the loop stack.
 *
 * gen7+: WHILE carries a JIP pointing back to the loop start.
 * gen6: a jump count in the immediate-word destination points back.
 * Pre-gen6: either a conditional ADD to IP (single-program-flow) or a
 * real WHILE whose jump count points back past the DO; pre-gen6
 * BREAK/CONTINUE inside the loop are patched here via
 * brw_patch_break_cont().
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 /* * 16 converts the instruction delta to a byte offset for IP. */
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1663
1664
1665 /* FORWARD JUMPS:
1666 */
1667 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1668 {
1669 struct brw_context *brw = p->brw;
1670 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1671 GLuint jmpi = 1;
1672
1673 if (brw->gen >= 5)
1674 jmpi = 2;
1675
1676 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1677 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1678
1679 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1680 }
1681
1682
1683
1684 /* To integrate with the above, it makes sense that the comparison
1685 * instruction should populate the flag register. It might be simpler
1686 * just to use the flag reg for most WM tasks?
1687 */
/**
 * Emit a CMP that writes the per-channel comparison result of src0 and
 * src1 (under \p conditional) into the flag register and \p dest.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1729
1730 /* Issue 'wait' instruction for n1, host could program MMIO
1731 to wake up thread. */
1732 void brw_WAIT (struct brw_compile *p)
1733 {
1734 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1735 struct brw_reg src = brw_notification_1_reg();
1736
1737 brw_set_dest(p, insn, src);
1738 brw_set_src0(p, insn, src);
1739 brw_set_src1(p, insn, brw_null_reg());
1740 insn->header.execution_size = 0; /* must */
1741 insn->header.predicate_control = 0;
1742 insn->header.compression_control = 0;
1743 }
1744
1745
1746 /***********************************************************************
1747 * Helpers for the various SEND message types:
1748 */
1749
1750 /** Extended math function, float[8].
1751 */
/**
 * Emit an extended math operation.
 *
 * On gen6+ this is a native MATH instruction (the function select lives
 * in the CondModifier/ThreadCtrl bits); on earlier gens it is a SEND to
 * the shared math unit, with \p msg_reg_nr, \p data_type and
 * \p precision describing the message.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* The integer-divide functions take integer sources; everything
       * else operates on floats.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1814
1815 /** Extended math function, float[8].
1816 */
/**
 * Emit a two-source extended math operation (gen6+ native MATH only;
 * there is no pre-gen6 SEND fallback here, unlike brw_math()).
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* Integer-divide variants take integer sources; all others floats. */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1864
1865
/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   GLuint offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control, msg_type;
   int mlen;

   /* Gen6+ takes the header's global offset in owords, not bytes. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One GRF holds two owords; mlen counts the header plus the payload
    * registers.
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* The SEND itself is always uncompressed; if we were emitting
       * compressed code, widen the header source to match the
       * execution size that was in effect.
       */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      /* Pre-gen6, the message register number is encoded in this field. */
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (brw->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (brw->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1974
1975
1976 /**
1977 * Read a block of owords (half a GRF each) from the scratch buffer
1978 * using a constant index per channel.
1979 *
1980 * Offset must be aligned to oword size (16 bytes). Used for register
1981 * spilling.
1982 */
1983 void
1984 brw_oword_block_read_scratch(struct brw_compile *p,
1985 struct brw_reg dest,
1986 struct brw_reg mrf,
1987 int num_regs,
1988 GLuint offset)
1989 {
1990 struct brw_context *brw = p->brw;
1991 uint32_t msg_control;
1992 int rlen;
1993
1994 if (brw->gen >= 6)
1995 offset /= 16;
1996
1997 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1998 dest = retype(dest, BRW_REGISTER_TYPE_UW);
1999
2000 if (num_regs == 1) {
2001 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2002 rlen = 1;
2003 } else {
2004 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2005 rlen = 2;
2006 }
2007
2008 {
2009 brw_push_insn_state(p);
2010 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2011 brw_set_mask_control(p, BRW_MASK_DISABLE);
2012
2013 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2014
2015 /* set message header global offset field (reg 0, element 2) */
2016 brw_MOV(p,
2017 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2018 mrf.nr,
2019 2), BRW_REGISTER_TYPE_UD),
2020 brw_imm_ud(offset));
2021
2022 brw_pop_insn_state(p);
2023 }
2024
2025 {
2026 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2027
2028 assert(insn->header.predicate_control == 0);
2029 insn->header.compression_control = BRW_COMPRESSION_NONE;
2030 insn->header.destreg__conditionalmod = mrf.nr;
2031
2032 brw_set_dest(p, insn, dest); /* UW? */
2033 if (brw->gen >= 6) {
2034 brw_set_src0(p, insn, mrf);
2035 } else {
2036 brw_set_src0(p, insn, brw_null_reg());
2037 }
2038
2039 brw_set_dp_read_message(p,
2040 insn,
2041 255, /* binding table index (255=stateless) */
2042 msg_control,
2043 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2044 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2045 1, /* msg_length */
2046 true, /* header_present */
2047 rlen);
2048 }
2049 }
2050
2051 /**
2052 * Read a float[4] vector from the data port Data Cache (const buffer).
2053 * Location (in buffer) should be a multiple of 16.
2054 * Used for fetching shader constants.
2055 */
2056 void brw_oword_block_read(struct brw_compile *p,
2057 struct brw_reg dest,
2058 struct brw_reg mrf,
2059 uint32_t offset,
2060 uint32_t bind_table_index)
2061 {
2062 struct brw_context *brw = p->brw;
2063
2064 /* On newer hardware, offset is in units of owords. */
2065 if (brw->gen >= 6)
2066 offset /= 16;
2067
2068 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2069
2070 brw_push_insn_state(p);
2071 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2072 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2073 brw_set_mask_control(p, BRW_MASK_DISABLE);
2074
2075 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2076
2077 /* set message header global offset field (reg 0, element 2) */
2078 brw_MOV(p,
2079 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2080 mrf.nr,
2081 2), BRW_REGISTER_TYPE_UD),
2082 brw_imm_ud(offset));
2083
2084 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2085 insn->header.destreg__conditionalmod = mrf.nr;
2086
2087 /* cast dest to a uword[8] vector */
2088 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2089
2090 brw_set_dest(p, insn, dest);
2091 if (brw->gen >= 6) {
2092 brw_set_src0(p, insn, mrf);
2093 } else {
2094 brw_set_src0(p, insn, brw_null_reg());
2095 }
2096
2097 brw_set_dp_read_message(p,
2098 insn,
2099 bind_table_index,
2100 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2101 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2102 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2103 1, /* msg_length */
2104 true, /* header_present */
2105 1); /* response_length (1 reg, 2 owords!) */
2106
2107 brw_pop_insn_state(p);
2108 }
2109
2110
2111 void brw_fb_WRITE(struct brw_compile *p,
2112 int dispatch_width,
2113 GLuint msg_reg_nr,
2114 struct brw_reg src0,
2115 GLuint msg_control,
2116 GLuint binding_table_index,
2117 GLuint msg_length,
2118 GLuint response_length,
2119 bool eot,
2120 bool header_present)
2121 {
2122 struct brw_context *brw = p->brw;
2123 struct brw_instruction *insn;
2124 GLuint msg_type;
2125 struct brw_reg dest;
2126
2127 if (dispatch_width == 16)
2128 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2129 else
2130 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2131
2132 if (brw->gen >= 6) {
2133 insn = next_insn(p, BRW_OPCODE_SENDC);
2134 } else {
2135 insn = next_insn(p, BRW_OPCODE_SEND);
2136 }
2137 /* The execution mask is ignored for render target writes. */
2138 insn->header.predicate_control = 0;
2139 insn->header.compression_control = BRW_COMPRESSION_NONE;
2140
2141 if (brw->gen >= 6) {
2142 /* headerless version, just submit color payload */
2143 src0 = brw_message_reg(msg_reg_nr);
2144
2145 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2146 } else {
2147 insn->header.destreg__conditionalmod = msg_reg_nr;
2148
2149 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2150 }
2151
2152 brw_set_dest(p, insn, dest);
2153 brw_set_src0(p, insn, src0);
2154 brw_set_dp_write_message(p,
2155 insn,
2156 binding_table_index,
2157 msg_control,
2158 msg_type,
2159 msg_length,
2160 header_present,
2161 eot, /* last render target write */
2162 response_length,
2163 eot,
2164 0 /* send_commit_msg */);
2165 }
2166
2167
2168 /**
2169 * Texture sample instruction.
2170 * Note: the msg_type plus msg_length values determine exactly what kind
2171 * of sampling operation is performed. See volume 4, page 161 of docs.
2172 */
2173 void brw_SAMPLE(struct brw_compile *p,
2174 struct brw_reg dest,
2175 GLuint msg_reg_nr,
2176 struct brw_reg src0,
2177 GLuint binding_table_index,
2178 GLuint sampler,
2179 GLuint msg_type,
2180 GLuint response_length,
2181 GLuint msg_length,
2182 GLuint header_present,
2183 GLuint simd_mode,
2184 GLuint return_format)
2185 {
2186 struct brw_context *brw = p->brw;
2187 struct brw_instruction *insn;
2188
2189 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2190
2191 insn = next_insn(p, BRW_OPCODE_SEND);
2192 insn->header.predicate_control = 0; /* XXX */
2193 insn->header.compression_control = BRW_COMPRESSION_NONE;
2194 if (brw->gen < 6)
2195 insn->header.destreg__conditionalmod = msg_reg_nr;
2196
2197 brw_set_dest(p, insn, dest);
2198 brw_set_src0(p, insn, src0);
2199 brw_set_sampler_message(p, insn,
2200 binding_table_index,
2201 sampler,
2202 msg_type,
2203 response_length,
2204 msg_length,
2205 header_present,
2206 simd_mode,
2207 return_format);
2208 }
2209
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   bool used,
		   GLuint msg_length,
		   GLuint response_length,
		   bool eot,
		   bool writes_complete,
		   GLuint offset,
		   GLuint swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   /* NOTE(review): takes &src0, so this may rewrite src0 to reference
    * the message register on newer gens — see its definition.
    */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * m0.5 = g0.5 | 0xff00.
       */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	     retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
	     brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register number is encoded in the destreg
    * field of the send instruction.
    */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       allocate,
		       used,
		       msg_length,
		       response_length,
		       eot,
		       writes_complete,
		       offset,
		       swizzle);
}
2266
2267 static int
2268 next_ip(struct brw_compile *p, int ip)
2269 {
2270 struct brw_instruction *insn = (void *)p->store + ip;
2271
2272 if (insn->header.cmpt_control)
2273 return ip + 8;
2274 else
2275 return ip + 16;
2276 }
2277
2278 static int
2279 brw_find_next_block_end(struct brw_compile *p, int start)
2280 {
2281 int ip;
2282 void *store = p->store;
2283
2284 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2285 struct brw_instruction *insn = store + ip;
2286
2287 switch (insn->header.opcode) {
2288 case BRW_OPCODE_ENDIF:
2289 case BRW_OPCODE_ELSE:
2290 case BRW_OPCODE_WHILE:
2291 case BRW_OPCODE_HALT:
2292 return ip;
2293 }
2294 }
2295
2296 return 0;
2297 }
2298
2299 /* There is no DO instruction on gen6, so to find the end of the loop
2300 * we have to see if the loop is jumping back before our start
2301 * instruction.
2302 */
2303 static int
2304 brw_find_loop_end(struct brw_compile *p, int start)
2305 {
2306 struct brw_context *brw = p->brw;
2307 int ip;
2308 int scale = 8;
2309 void *store = p->store;
2310
2311 /* Always start after the instruction (such as a WHILE) we're trying to fix
2312 * up.
2313 */
2314 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2315 struct brw_instruction *insn = store + ip;
2316
2317 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2318 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2319 : insn->bits3.break_cont.jip;
2320 if (ip + jip * scale <= start)
2321 return ip;
2322 }
2323 }
2324 assert(!"not reached");
2325 return start;
2326 }
2327
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int ip;
   int scale = 8;   /* JIP/UIP are encoded in units of 8 bytes */
   void *store = p->store;

   /* Nothing to do before gen6. */
   if (brw->gen < 6)
      return;

   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
	 assert(block_end_ip != 0);
	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
	 insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, ip) - ip +
	     (brw->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
	 assert(block_end_ip != 0);
	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, ip) - ip) / scale;

	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
	 /* No block end found: jip = 2 (i.e. 16 bytes, one full-size
	  * instruction forward).
	  */
	 if (block_end_ip == 0)
	    insn->bits3.break_cont.jip = 2;
	 else
	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 break;

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_ip == 0) {
	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2403
2404 void brw_ff_sync(struct brw_compile *p,
2405 struct brw_reg dest,
2406 GLuint msg_reg_nr,
2407 struct brw_reg src0,
2408 bool allocate,
2409 GLuint response_length,
2410 bool eot)
2411 {
2412 struct brw_context *brw = p->brw;
2413 struct brw_instruction *insn;
2414
2415 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2416
2417 insn = next_insn(p, BRW_OPCODE_SEND);
2418 brw_set_dest(p, insn, dest);
2419 brw_set_src0(p, insn, src0);
2420 brw_set_src1(p, insn, brw_imm_d(0));
2421
2422 if (brw->gen < 6)
2423 insn->header.destreg__conditionalmod = msg_reg_nr;
2424
2425 brw_set_ff_sync_message(p,
2426 insn,
2427 allocate,
2428 response_length,
2429 eot);
2430 }
2431
2432 /**
2433 * Emit the SEND instruction necessary to generate stream output data on Gen6
2434 * (for transform feedback).
2435 *
2436 * If send_commit_msg is true, this is the last piece of stream output data
2437 * from this thread, so send the data as a committed write. According to the
2438 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2439 *
2440 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2441 * writes are complete by sending the final write as a committed write."
2442 */
2443 void
2444 brw_svb_write(struct brw_compile *p,
2445 struct brw_reg dest,
2446 GLuint msg_reg_nr,
2447 struct brw_reg src0,
2448 GLuint binding_table_index,
2449 bool send_commit_msg)
2450 {
2451 struct brw_instruction *insn;
2452
2453 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2454
2455 insn = next_insn(p, BRW_OPCODE_SEND);
2456 brw_set_dest(p, insn, dest);
2457 brw_set_src0(p, insn, src0);
2458 brw_set_src1(p, insn, brw_imm_d(0));
2459 brw_set_dp_write_message(p, insn,
2460 binding_table_index,
2461 0, /* msg_control: ignored */
2462 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2463 1, /* msg_length */
2464 true, /* header_present */
2465 0, /* last_render_target: ignored */
2466 send_commit_msg, /* response_length */
2467 0, /* end_of_thread */
2468 send_commit_msg); /* send_commit_msg */
2469 }
2470
/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
 * one u32.  So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_compile *p,
			 struct brw_reg payload,
			 uint32_t surf_index)
{
   struct brw_context *brw = p->brw;
   assert(brw->gen >= 7);

   /* Emit the SEND itself with ALIGN1 access and masking disabled. */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
				      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
				      payload.nr, 0));

   /* Haswell routes untyped atomics through data cache port 1 and uses
    * its own message-type encoding.
    */
   uint32_t sfid, msg_type;
   if (brw->is_haswell) {
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
   } else {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
   }

   bool header_present = false;
   bool eot = false;
   uint32_t mlen = 2; /* offset, value */
   uint32_t rlen = 0;
   brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);

   /* Pack the remaining untyped-atomic descriptor fields by hand. */
   send->bits3.ud |= msg_type << 14;
   send->bits3.ud |= 0 << 13; /* no return data */
   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
   send->bits3.ud |= BRW_AOP_ADD << 8; /* atomic operation: ADD */
   send->bits3.ud |= surf_index << 0; /* surface index */
}