/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */


#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "glsl/ralloc.h"

/***********************************************************************
 * Internal helper for constructing instructions
 */

static void guess_execution_size(struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 struct brw_reg reg)
{
   if (reg.width == BRW_WIDTH_8 && p->compressed)
      insn->header.execution_size = BRW_EXECUTE_16;
   else
      insn->header.execution_size = reg.width;  /* note - definitions are compatible */
}


/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
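
/* Illustrative sketch (not part of the driver): a caller that wants a SEND
 * payload to start in m1 builds it in a GRF, resolves the implied move, and
 * only then emits the SEND.  The register choices here are hypothetical.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);  // payload built in g2
 *    gen6_resolve_implied_move(p, &payload, 1);    // Gen6+: emits MOV m1, g2
 *    // ... emit the SEND with src0 = payload (now m1) ...
 */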

static void
gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
{
   /* From the BSpec / ISA Reference / send - [DevIVB+]:
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
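
/* Sketch of the remapping (assuming GEN7_MRF_HACK_START is 112, matching the
 * R112-R127 range quoted above): a notional m4 on Gen7 becomes g116.
 *
 *    struct brw_reg r = brw_message_reg(4);  // file = MRF, nr = 4
 *    gen7_convert_mrf_to_grf(p, &r);         // file = GRF, nr = 4 + 112 = 116
 */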


void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* even ignored in da16, still need to set as '01' */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}

extern int reg_type_size[];

static void
validate_reg(struct brw_instruction *insn, struct brw_reg reg)
{
   int hstride_for_reg[] = {0, 1, 2, 4};
   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
   int width_for_reg[] = {1, 2, 4, 8, 16};
   int execsize_for_reg[] = {1, 2, 4, 8, 16};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
                reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
   width = width_for_reg[reg.width];

   assert(insn->header.execution_size >= 0 &&
          insn->header.execution_size < Elements(execsize_for_reg));
   execsize = execsize_for_reg[insn->header.execution_size];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}
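
/* Worked example of the rules above: the common <8;8,1> float region
 * (vstride = 8, width = 8, hstride = 1) at execsize 8 satisfies rule 3
 * (execsize 8 >= width 8) and rule 4 (vstride 8 == width 8 * hstride 1),
 * while a scalar <0;1,0> region satisfies rules 6-8 (width 1, hstride 0,
 * vstride 0).
 */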

void
brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg reg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
                           insn->header.opcode == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(insn, reg);

   insn->bits1.da1.src0_reg_file = reg.file;
   insn->bits1.da1.src0_reg_type = reg.type;
   insn->bits2.da1.src0_abs = reg.abs;
   insn->bits2.da1.src0_negate = reg.negate;
   insn->bits2.da1.src0_address_mode = reg.address_mode;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;

      /* Required to set some fields in src1 as well:
       */
      insn->bits1.da1.src1_reg_file = 0; /* arf */
      insn->bits1.da1.src1_reg_type = reg.type;
   }
   else
   {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.da1.src0_subreg_nr = reg.subnr;
            insn->bits2.da1.src0_reg_nr = reg.nr;
         }
         else {
            insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
            insn->bits2.da16.src0_reg_nr = reg.nr;
         }
      }
      else {
         insn->bits2.ia1.src0_subreg_nr = reg.subnr;

         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
         }
         else {
            insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
         }
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits2.da1.src0_width = BRW_WIDTH_1;
            insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits2.da1.src0_horiz_stride = reg.hstride;
            insn->bits2.da1.src0_width = reg.width;
            insn->bits2.da1.src0_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits2.da16.src0_vert_stride = reg.vstride;
      }
   }
}


void brw_set_src1(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits3.da1.src1_subreg_nr = reg.subnr;
         insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
         insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
            insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits3.da1.src1_horiz_stride = reg.hstride;
            insn->bits3.da1.src1_width = reg.width;
            insn->bits3.da1.src1_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
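
/* Sketch of the required ordering (argument values are hypothetical): the
 * descriptor must be written first, because it zeroes the Function Control
 * bits that message-specific setup then fills in.
 *
 *    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
 *                               2,       // msg_length
 *                               0,       // response_length
 *                               true,    // header_present
 *                               false);  // end_of_thread
 *    insn->bits3.urb_gen7.opcode = 0;    // message-specific bits afterwards
 */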

static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  GLuint function,
                                  GLuint integer_type,
                                  bool low_precision,
                                  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = insn->header.saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = insn->header.saturate;
      insn->bits3.math.data_type = dataType;
   }
   insn->header.saturate = 0;
}


static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    bool allocate,
                                    GLuint response_length,
                                    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}

static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 bool allocate,
                                 bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
                                 bool end_of_thread,
                                 bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;  /* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;  /* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;  /* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;  /* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;  /* ? */
      insn->bits3.urb.complete = complete;
   }
}

void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         GLuint binding_table_index,
                         GLuint msg_control,
                         GLuint msg_type,
                         GLuint msg_length,
                         bool header_present,
                         GLuint last_render_target,
                         GLuint response_length,
                         GLuint end_of_thread,
                         GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}

void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint msg_control,
                        GLuint msg_type,
                        GLuint target_cache,
                        GLuint msg_length,
                        bool header_present,
                        GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}

void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}


#define next_insn brw_next_insn
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   if (p->nr_insn + 1 > p->store_size) {
      if (0)
         printf("increasing the store size to %d\n", p->store_size << 1);
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
      if (!p->store)
         assert(!"realloc eu store memory failed");
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */
   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
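
/* Typical emission pattern (illustrative): allocate the instruction, then
 * fill in its operands with the helpers above.
 *
 *    struct brw_instruction *mov = brw_next_insn(p, BRW_OPCODE_MOV);
 *    brw_set_dest(p, mov, dst);
 *    brw_set_src0(p, mov, src);
 */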

static struct brw_instruction *brw_alu1( struct brw_compile *p,
                                         GLuint opcode,
                                         struct brw_reg dest,
                                         struct brw_reg src )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static struct brw_instruction *brw_alu2(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1 )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
   } else {
      return reg.subnr / 4;
   }
}

static struct brw_instruction *brw_alu3(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
{
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   assert(src0.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   assert(src1.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   assert(src2.type == BRW_REGISTER_TYPE_F);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   return insn;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                                  \
struct brw_instruction *brw_##OP(struct brw_compile *p,           \
                                 struct brw_reg dest,             \
                                 struct brw_reg src0)             \
{                                                                 \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);               \
}

#define ALU2(OP)                                                  \
struct brw_instruction *brw_##OP(struct brw_compile *p,           \
                                 struct brw_reg dest,             \
                                 struct brw_reg src0,             \
                                 struct brw_reg src1)             \
{                                                                 \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);         \
}

#define ALU3(OP)                                                  \
struct brw_instruction *brw_##OP(struct brw_compile *p,           \
                                 struct brw_reg dest,             \
                                 struct brw_reg src0,             \
                                 struct brw_reg src1,             \
                                 struct brw_reg src2)             \
{                                                                 \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                                 \
void brw_##OP(struct brw_compile *p,                              \
              struct brw_reg dest,                                \
              struct brw_reg src)                                 \
{                                                                 \
   struct brw_instruction *rnd, *add;                             \
   rnd = next_insn(p, BRW_OPCODE_##OP);                           \
   brw_set_dest(p, rnd, dest);                                    \
   brw_set_src0(p, rnd, src);                                     \
                                                                  \
   if (p->brw->intel.gen < 6) {                                   \
      /* turn on round-increments */                              \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;    \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));              \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;       \
   }                                                              \
}
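
/* On pre-gen6 hardware, a ROUND op therefore expands to roughly this pair
 * (illustrative assembly, not exact syntax):
 *
 *    rndz.r    dst, src        // round; set per-channel "increment" flag bits
 *    (+f0) add dst, dst, 1.0   // predicated fix-up where the flag was set
 */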


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)
ALU3(LRP)

ROUND(RNDZ)
ROUND(RNDE)


struct brw_instruction *brw_ADD(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

struct brw_instruction *brw_AVG(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      assert(!"Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

struct brw_instruction *brw_MUL(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}


void brw_NOP(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}


/***********************************************************************
 * Comparisons, if/else/endif
 */

struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}

static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static struct brw_instruction *
pop_if_stack(struct brw_compile *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
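
/* Typical structured control flow emission (illustrative; operand registers
 * are placeholders):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, a, b); // sets flag reg
 *    brw_IF(p, BRW_EXECUTE_8);   // predicated on the flag set above
 *       // ... "then" block ...
 *    brw_ELSE(p);
 *       // ... "else" block ...
 *    brw_ENDIF(p);               // patches the IF/ELSE jump targets
 */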

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump counts are in units of 64-bit chunks, so one 128-bit
    * instruction requires a jump distance of 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         if_inst->header.opcode = BRW_OPCODE_IFF;
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}

void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store memory (p->store), so call it first, before referencing the
    * instruction store pointer via an index.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}

struct brw_instruction *gen6_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   return insn;
}

struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}

struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
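
/* Typical loop emission (illustrative):
 *
 *    brw_DO(p, BRW_EXECUTE_8);   // pre-gen6: emits DO; gen6+: marks loop top
 *       // ... loop body, possibly with brw_BREAK(p) / brw_CONT(p) ...
 *    brw_WHILE(p);               // jumps back and patches BREAK/CONT targets
 */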

/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   int br = (intel->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
          inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
                 inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}

struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         insn->header.execution_size = BRW_EXECUTE_1;
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(do_insn->header.opcode == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         insn->header.execution_size = do_insn->header.execution_size;
         insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
         insn->bits3.if_else.pop_count = 0;
         insn->bits3.if_else.pad0 = 0;

         brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}


/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   GLuint jmpi = 1;

   if (intel->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
}
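
/* One plausible usage pattern (illustrative): record the JMPI's index, emit
 * the instructions to be skipped, then land the jump at the current end.
 *
 *    int jmp_idx = p->nr_insn;   // index the JMPI will occupy
 *    brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    // ... emit the skipped-over instructions ...
 *    brw_land_fwd_jump(p, jmp_idx);  // patches the JMPI's immediate
 */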
1643
1644
1645
1646 /* To integrate with the above, it makes sense that the comparison
1647 * instruction should populate the flag register. It might be simpler
1648 * just to use the flag reg for most WM tasks?
1649 */
1650 void brw_CMP(struct brw_compile *p,
1651 struct brw_reg dest,
1652 GLuint conditional,
1653 struct brw_reg src0,
1654 struct brw_reg src1)
1655 {
1656 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1657
1658 insn->header.destreg__conditionalmod = conditional;
1659 brw_set_dest(p, insn, dest);
1660 brw_set_src0(p, insn, src0);
1661 brw_set_src1(p, insn, src1);
1662
1663 /* guess_execution_size(insn, src0); */
1664
1665
1666 /* Make it so that future instructions will use the computed flag
1667 * value until brw_set_predicate_control_flag_value() is called
1668 * again.
1669 */
1670 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1671 dest.nr == 0) {
1672 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1673 p->flag_value = 0xff;
1674 }
1675 }
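
/* Illustrative use (not part of the original source): comparing two
 * caller-provided float registers "a" and "b", discarding the result
 * through the null register so that only the flag register is updated
 * and subsequent instructions become predicated:
 *
 *    brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_F),
 *            BRW_CONDITIONAL_GE, a, b);
 */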

/* Issue the 'wait' instruction for notification register n1; the host
 * can program an MMIO register to wake the thread up.
 */
void brw_WAIT(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must be 0 (i.e. BRW_EXECUTE_1) */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}


/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 */
void brw_math(struct brw_compile *p,
              struct brw_reg dest,
              GLuint function,
              GLuint msg_reg_nr,
              struct brw_reg src,
              GLuint data_type,
              GLuint precision)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
         assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
         assert(!src.negate);
         assert(!src.abs);
      }

      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
          function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
         assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
         assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
                           insn,
                           function,
                           src.type == BRW_REGISTER_TYPE_D,
                           precision,
                           data_type);
   }
}

/** Extended math function, float[8].
 */
void brw_math2(struct brw_compile *p,
               struct brw_reg dest,
               GLuint function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);
   (void) intel;

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (intel->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (intel->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
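
/* Illustrative use (not part of the original source): a SIMD8 POW on
 * Gen6+, where "dst", "base" and "exp" stand for caller-provided GRF
 * registers satisfying the asserts above:
 *
 *    brw_math2(p, dst, BRW_MATH_FUNCTION_POW, base, exp);
 */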


/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

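   /* mlen counts the message header register plus the one or two data
    * registers being written.
    */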
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
         insn->header.compression_control = BRW_COMPRESSION_NONE;
         src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (intel->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
                               msg_control,
                               msg_type,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}
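
/* Illustrative use (not part of the original source): spilling a single
 * GRF through message register m1 to the start of the scratch space
 * (offsets must be 16-byte aligned):
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, 0);
 */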


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control;
   int rlen;

   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest); /* UW? */
      if (intel->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
                              insn,
                              255, /* binding table index (255=stateless) */
                              msg_control,
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                              1, /* msg_length */
                              true, /* header_present */
                              rlen);
   }
}
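
/* Illustrative use (not part of the original source): unspilling that
 * register again into a caller-chosen GRF "dst", reusing m1 for the
 * message header:
 *
 *    brw_oword_block_read_scratch(p, dst, brw_message_reg(1), 1, 0);
 */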

/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_compile *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;

   /* On newer hardware, offset is in units of owords. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (intel->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           true, /* header_present */
                           1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}


void brw_fb_WRITE(struct brw_compile *p,
                  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint msg_control,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  bool eot,
                  bool header_present)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_type;
   struct brw_reg dest;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

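   /* Gen6+ uses SENDC, which (per the PRM description of sendc) gates the
    * message on the render target dependency check, so overlapping writes
    * from earlier threads land first.
    */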
   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (intel->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            msg_length,
                            header_present,
                            eot, /* last render target write */
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}


/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_compile *p,
                struct brw_reg dest,
                GLuint msg_reg_nr,
                struct brw_reg src0,
                GLuint binding_table_index,
                GLuint sampler,
                GLuint msg_type,
                GLuint response_length,
                GLuint msg_length,
                GLuint header_present,
                GLuint simd_mode,
                GLuint return_format)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.predicate_control = 0; /* XXX */
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
                   bool allocate,
                   bool used,
                   GLuint msg_length,
                   GLuint response_length,
                   bool eot,
                   bool writes_complete,
                   GLuint offset,
                   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
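      /* This header setup runs in WE_all (mask-disabled) mode so that the
       * channel-mask DWord is written even when some execution channels
       * are disabled.
       */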
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
                       insn,
                       allocate,
                       used,
                       msg_length,
                       response_length,
                       eot,
                       writes_complete,
                       offset,
                       swizzle);
}

static int
next_ip(struct brw_compile *p, int ip)
{
   struct brw_instruction *insn = (void *)p->store + ip;

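   /* Compacted instructions occupy 8 bytes; full instructions occupy 16. */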
   if (insn->header.cmpt_control)
      return ip + 8;
   else
      return ip + 16;
}

static int
brw_find_next_block_end(struct brw_compile *p, int start)
{
   int ip;
   void *store = p->store;

   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      switch (insn->header.opcode) {
      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_HALT:
         return ip;
      }
   }

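   /* Returning 0 signals that no block-ending instruction was found. */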
   return 0;
}

/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
         int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
                                   : insn->bits3.break_cont.jip;
         if (ip + jip * scale <= start)
            return ip;
      }
   }
   assert(!"not reached");
   return start;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;
   void *store = p->store;

   if (intel->gen < 6)
      return;

   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
         /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(insn->header.opcode != BRW_OPCODE_BREAK &&
                insn->header.opcode != BRW_OPCODE_CONTINUE &&
                insn->header.opcode != BRW_OPCODE_HALT);
         continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         /* Gen7 UIP points to WHILE; Gen6 points just after it. */
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip +
             (intel->gen == 6 ? 16 : 0)) / scale;
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;

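      /* An ENDIF with no block-ending instruction after it jumps to the
       * next instruction: jip = 2 is one full instruction in the 8-byte
       * units used here.
       */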
      case BRW_OPCODE_ENDIF:
         if (block_end_ip == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         break;

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same.  In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_ip == 0) {
            insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
         } else {
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         }
         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
      }
   }
}

void brw_ff_sync(struct brw_compile *p,
                 struct brw_reg dest,
                 GLuint msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 GLuint response_length,
                 bool eot)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gen6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write.  According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *    "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *     writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_compile *p,
              struct brw_reg dest,
              GLuint msg_reg_nr,
              struct brw_reg src0,
              GLuint binding_table_index,
              bool send_commit_msg)
{
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we just want to
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_compile *p,
                         int base_mrf,
                         uint32_t surf_index)
{
   struct intel_context *intel = &p->brw->intel;
   assert(intel->gen >= 7);

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                      base_mrf, 0));

   uint32_t sfid, msg_type;
   if (intel->is_haswell) {
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
   } else {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
   }

   bool header_present = false;
   bool eot = false;
   uint32_t mlen = 2; /* offset, value */
   uint32_t rlen = 0;
   brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);

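   /* Fill in the remaining untyped-atomic fields of the extended message
    * descriptor by hand (bit positions per the Gen7 data cache dataport
    * layout).
    */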
   send->bits3.ud |= msg_type << 14;
   send->bits3.ud |= 0 << 13; /* no return data */
   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
   send->bits3.ud |= BRW_AOP_ADD << 8;
   send->bits3.ud |= surf_index << 0;
}