src/mesa/drivers/dri/i965/brw_eu_emit.c
/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */


#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "glsl/ralloc.h"

/***********************************************************************
 * Internal helper for constructing instructions
 */

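/* Infer an execution size for an instruction from the width of its
 * destination register: in compressed (SIMD16) mode, an 8-wide register
 * still implies a 16-channel execution size.
 */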
static void guess_execution_size(struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 struct brw_reg reg)
{
   if (reg.width == BRW_WIDTH_8 && p->compressed)
      insn->header.execution_size = BRW_EXECUTE_16;
   else
      insn->header.execution_size = reg.width; /* note - definitions are compatible */
}


/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case. This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
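/* A minimal usage sketch (not part of the original file; the GRF and MRF
 * numbers are hypothetical):
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &payload, 1);
 *    // On gen6+, payload now refers to m1 and the data has been MOVed
 *    // there; on earlier gens the call is a no-op and the SEND itself
 *    // performs the implied move.
 */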

static void
gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
{
   /* From the BSpec / ISA Reference / send - [DevIVB+]:
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}


void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          * Although Dst.HorzStride is a don't care for Align16, HW needs
          * this to be programmed as "01".
          */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* even ignored in da16, still need to set as '01' */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}

extern int reg_type_size[];

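/* Sanity-check a register region against the region restrictions listed in
 * the PRM ("Register Region Restrictions"). The *_for_reg tables decode the
 * encoded hstride/vstride/width fields into element counts.
 */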
static void
validate_reg(struct brw_instruction *insn, struct brw_reg reg)
{
   int hstride_for_reg[] = {0, 1, 2, 4};
   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
   int width_for_reg[] = {1, 2, 4, 8, 16};
   int execsize_for_reg[] = {1, 2, 4, 8, 16};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters. Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
                reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
   width = width_for_reg[reg.width];

   assert(insn->header.execution_size >= 0 &&
          insn->header.execution_size < Elements(execsize_for_reg));
   execsize = execsize_for_reg[insn->header.execution_size];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}

void
brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg reg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
                           insn->header.opcode == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(insn, reg);

   insn->bits1.da1.src0_reg_file = reg.file;
   insn->bits1.da1.src0_reg_type = reg.type;
   insn->bits2.da1.src0_abs = reg.abs;
   insn->bits2.da1.src0_negate = reg.negate;
   insn->bits2.da1.src0_address_mode = reg.address_mode;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;

      /* Required to set some fields in src1 as well:
       */
      insn->bits1.da1.src1_reg_file = 0; /* arf */
      insn->bits1.da1.src1_reg_type = reg.type;
   }
   else
   {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.da1.src0_subreg_nr = reg.subnr;
            insn->bits2.da1.src0_reg_nr = reg.nr;
         }
         else {
            insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
            insn->bits2.da16.src0_reg_nr = reg.nr;
         }
      }
      else {
         insn->bits2.ia1.src0_subreg_nr = reg.subnr;

         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
         }
         else {
            insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
         }
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits2.da1.src0_width = BRW_WIDTH_1;
            insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits2.da1.src0_horiz_stride = reg.hstride;
            insn->bits2.da1.src0_width = reg.width;
            insn->bits2.da1.src0_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits2.da16.src0_vert_stride = reg.vstride;
      }
   }
}


void brw_set_src1(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits3.da1.src1_subreg_nr = reg.subnr;
         insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
         insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
            insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits3.da1.src1_horiz_stride = reg.hstride;
            insn->bits3.da1.src1_width = reg.width;
            insn->bits3.da1.src1_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data. Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}

static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  GLuint function,
                                  GLuint integer_type,
                                  bool low_precision,
                                  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = insn->header.saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = insn->header.saturate;
      insn->bits3.math.data_type = dataType;
   }
   insn->header.saturate = 0;
}


static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    bool allocate,
                                    GLuint response_length,
                                    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}

static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 bool allocate,
                                 bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
                                 bool end_of_thread,
                                 bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used; /* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0; /* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used; /* ? */
      insn->bits3.urb.complete = complete;
   }
}

void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         GLuint binding_table_index,
                         GLuint msg_control,
                         GLuint msg_type,
                         GLuint msg_length,
                         bool header_present,
                         GLuint last_render_target,
                         GLuint response_length,
                         GLuint end_of_thread,
                         GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}

void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint msg_control,
                        GLuint msg_type,
                        GLuint target_cache,
                        GLuint msg_length,
                        bool header_present,
                        GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
   }
}

void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}

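/* Allocate the next instruction slot in the program store (growing the
 * store if needed), initialize it from the current default instruction
 * state in p->current, and tag it with the given opcode.
 */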
#define next_insn brw_next_insn
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   if (p->nr_insn + 1 > p->store_size) {
      if (0)
         printf("increasing the store size to %d\n", p->store_size << 1);
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
      if (!p->store)
         assert(!"realloc eu store memory failed");
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}

static struct brw_instruction *brw_alu1( struct brw_compile *p,
                                         GLuint opcode,
                                         struct brw_reg dest,
                                         struct brw_reg src )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static struct brw_instruction *brw_alu2(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1 )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

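/* Three-source instructions encode subregister numbers in units of dwords
 * rather than bytes. For a scalar (vstride 0) source, also fold the
 * replicated swizzle channel into the subregister number.
 */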
static int
get_3src_subreg_nr(struct brw_reg reg)
{
   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
   } else {
      return reg.subnr / 4;
   }
}

static struct brw_instruction *brw_alu3(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (intel->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types. The MAD and LRP emitters ensure
       * that all four types are float. The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                              \
struct brw_instruction *brw_##OP(struct brw_compile *p,       \
                                 struct brw_reg dest,         \
                                 struct brw_reg src0)         \
{                                                             \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);           \
}

#define ALU2(OP)                                              \
struct brw_instruction *brw_##OP(struct brw_compile *p,       \
                                 struct brw_reg dest,         \
                                 struct brw_reg src0,         \
                                 struct brw_reg src1)         \
{                                                             \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);     \
}

#define ALU3(OP)                                              \
struct brw_instruction *brw_##OP(struct brw_compile *p,       \
                                 struct brw_reg dest,         \
                                 struct brw_reg src0,         \
                                 struct brw_reg src1,         \
                                 struct brw_reg src2)         \
{                                                             \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

#define ALU3F(OP)                                             \
struct brw_instruction *brw_##OP(struct brw_compile *p,       \
                                 struct brw_reg dest,         \
                                 struct brw_reg src0,         \
                                 struct brw_reg src1,         \
                                 struct brw_reg src2)         \
{                                                             \
   assert(dest.type == BRW_REGISTER_TYPE_F);                  \
   assert(src0.type == BRW_REGISTER_TYPE_F);                  \
   assert(src1.type == BRW_REGISTER_TYPE_F);                  \
   assert(src2.type == BRW_REGISTER_TYPE_F);                  \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register. A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                             \
void brw_##OP(struct brw_compile *p,                          \
              struct brw_reg dest,                            \
              struct brw_reg src)                             \
{                                                             \
   struct brw_instruction *rnd, *add;                         \
   rnd = next_insn(p, BRW_OPCODE_##OP);                       \
   brw_set_dest(p, rnd, dest);                                \
   brw_set_src0(p, rnd, src);                                 \
                                                              \
   if (p->brw->intel.gen < 6) {                               \
      /* turn on round-increments */                          \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));          \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;   \
   }                                                          \
}


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)

ROUND(RNDZ)
ROUND(RNDE)


struct brw_instruction *brw_ADD(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

struct brw_instruction *brw_AVG(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      assert(!"Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

struct brw_instruction *brw_MUL(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}


void brw_NOP(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}



/***********************************************************************
 * Comparisons, if/else/endif
 */

struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}

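/* The if_stack holds the indices (not pointers, since p->store may be
 * reallocated) of the IF/ELSE instructions that still need their jump
 * targets patched by brw_ENDIF().
 */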
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static struct brw_instruction *
pop_if_stack(struct brw_compile *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   /* Grow before writing: both loop_stack[loop_stack_depth] and
    * if_depth_in_loop[loop_stack_depth + 1] are about to be written.
    */
   if (p->loop_stack_array_size <= p->loop_stack_depth + 1) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack). Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off. If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
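
/* A minimal usage sketch (not from the original source; the condition setup
 * via CMP is a hypothetical example):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, y);
 *    brw_IF(p, BRW_EXECUTE_8);
 *    ...then-block instructions...
 *    brw_ELSE(p);
 *    ...else-block instructions...
 *    brw_ENDIF(p);    // patches the IF/ELSE jump targets
 */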

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier). It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* The jump count is in units of 64-bit chunks, so one 128-bit
    * instruction requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         if_inst->header.opcode = BRW_OPCODE_IFF;
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}

void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store memory (p->store), so call it first before referencing the
    * instruction store pointer from an index.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}

struct brw_instruction *gen6_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   return insn;
}

struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}

struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop. We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO. WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
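
/* A minimal usage sketch (not from the original source):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *    ...loop body, using brw_BREAK(p) / brw_CONT(p) as needed...
 *    brw_WHILE(p);    // jumps back; pre-gen6 also patches BREAK/CONT
 */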
1575
1576 /**
1577 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1578 * instruction here.
1579 *
1580 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1581 * nesting, since it can always just point to the end of the block/current loop.
1582 */
1583 static void
1584 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1585 {
1586 struct intel_context *intel = &p->brw->intel;
1587 struct brw_instruction *do_inst = get_inner_do_insn(p);
1588 struct brw_instruction *inst;
1589 int br = (intel->gen == 5) ? 2 : 1;
1590
1591 for (inst = while_inst - 1; inst != do_inst; inst--) {
1592 /* If the jump count is != 0, that means that this instruction has already
1593 * been patched because it's part of a loop inside of the one we're
1594 * patching.
1595 */
1596 if (inst->header.opcode == BRW_OPCODE_BREAK &&
1597 inst->bits3.if_else.jump_count == 0) {
1598 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1599 } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1600 inst->bits3.if_else.jump_count == 0) {
1601 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1602 }
1603 }
1604 }
1605
1606 struct brw_instruction *brw_WHILE(struct brw_compile *p)
1607 {
1608 struct intel_context *intel = &p->brw->intel;
1609 struct brw_instruction *insn, *do_insn;
1610 GLuint br = 1;
1611
1612 if (intel->gen >= 5)
1613 br = 2;
1614
1615 if (intel->gen >= 7) {
1616 insn = next_insn(p, BRW_OPCODE_WHILE);
1617 do_insn = get_inner_do_insn(p);
1618
1619 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1620 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1621 brw_set_src1(p, insn, brw_imm_ud(0));
1622 insn->bits3.break_cont.jip = br * (do_insn - insn);
1623
1624 insn->header.execution_size = BRW_EXECUTE_8;
1625 } else if (intel->gen == 6) {
1626 insn = next_insn(p, BRW_OPCODE_WHILE);
1627 do_insn = get_inner_do_insn(p);
1628
1629 brw_set_dest(p, insn, brw_imm_w(0));
1630 insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1631 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1632 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1633
1634 insn->header.execution_size = BRW_EXECUTE_8;
1635 } else {
1636 if (p->single_program_flow) {
1637 insn = next_insn(p, BRW_OPCODE_ADD);
1638 do_insn = get_inner_do_insn(p);
1639
1640 brw_set_dest(p, insn, brw_ip_reg());
1641 brw_set_src0(p, insn, brw_ip_reg());
1642 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1643 insn->header.execution_size = BRW_EXECUTE_1;
1644 } else {
1645 insn = next_insn(p, BRW_OPCODE_WHILE);
1646 do_insn = get_inner_do_insn(p);
1647
1648 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1649
1650 brw_set_dest(p, insn, brw_ip_reg());
1651 brw_set_src0(p, insn, brw_ip_reg());
1652 brw_set_src1(p, insn, brw_imm_d(0));
1653
1654 insn->header.execution_size = do_insn->header.execution_size;
1655 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1656 insn->bits3.if_else.pop_count = 0;
1657 insn->bits3.if_else.pad0 = 0;
1658
1659 brw_patch_break_cont(p, insn);
1660 }
1661 }
1662 insn->header.compression_control = BRW_COMPRESSION_NONE;
1663 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1664
1665 p->loop_stack_depth--;
1666
1667 return insn;
1668 }
1669
1670
1671 /* FORWARD JUMPS:
1672 */
1673 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1674 {
1675 struct intel_context *intel = &p->brw->intel;
1676 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1677 GLuint jmpi = 1;
1678
1679 if (intel->gen >= 5)
1680 jmpi = 2;
1681
1682 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1683 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1684
1685 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1686 }
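
/* A hedged sketch of the caller protocol (illustrative, not a verbatim
 * caller): record the index of a JMPI emitted with a zero immediate, emit
 * the instructions to be skipped, then patch the distance:
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    ... emit instructions to be skipped ...
 *    brw_land_fwd_jump(p, jmp);
 */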
1687
1688
1689
1690 /* To integrate with the jump instructions above, the comparison
1691  * instruction should populate the flag register.  It might be simpler
1692  * still to use the flag register for most WM tasks.
1693  */
1694 void brw_CMP(struct brw_compile *p,
1695 struct brw_reg dest,
1696 GLuint conditional,
1697 struct brw_reg src0,
1698 struct brw_reg src1)
1699 {
1700 struct intel_context *intel = &p->brw->intel;
1701 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1702
1703 insn->header.destreg__conditionalmod = conditional;
1704 brw_set_dest(p, insn, dest);
1705 brw_set_src0(p, insn, src0);
1706 brw_set_src1(p, insn, src1);
1707
1711 /* Make it so that future instructions will use the computed flag
1712 * value until brw_set_predicate_control_flag_value() is called
1713 * again.
1714 */
1715 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1716 dest.nr == 0) {
1717 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1718 p->flag_value = 0xff;
1719 }
1720
1721 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1722 * page says:
1723 * "Any CMP instruction with a null destination must use a {switch}."
1724 *
1725 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1726 * mentioned on their work-arounds pages.
1727 */
1728 if (intel->gen == 7) {
1729 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1730 dest.nr == BRW_ARF_NULL) {
1731 insn->header.thread_control = BRW_THREAD_SWITCH;
1732 }
1733 }
1734 }
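
/* A hypothetical sketch of the resulting idiom: compare into the null
 * register to set f0, then let following instructions predicate on it
 * (brw_CMP enables BRW_PREDICATE_NORMAL itself when dest is null):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, x, y);
 *    brw_SEL(p, dst, x, y);                     predicated on f0
 *    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 */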
1735
1736 /* Issue a 'wait' instruction on notification register n1; the host can
1737  * program MMIO to set the notification and wake the thread. */
1738 void brw_WAIT (struct brw_compile *p)
1739 {
1740 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1741 struct brw_reg src = brw_notification_1_reg();
1742
1743 brw_set_dest(p, insn, src);
1744 brw_set_src0(p, insn, src);
1745 brw_set_src1(p, insn, brw_null_reg());
1746    insn->header.execution_size = 0; /* must be BRW_EXECUTE_1 (one channel) */
1747 insn->header.predicate_control = 0;
1748 insn->header.compression_control = 0;
1749 }
1750
1751
1752 /***********************************************************************
1753 * Helpers for the various SEND message types:
1754 */
1755
1756 /** Extended math function, float[8].
1757 */
1758 void brw_math( struct brw_compile *p,
1759 struct brw_reg dest,
1760 GLuint function,
1761 GLuint msg_reg_nr,
1762 struct brw_reg src,
1763 GLuint data_type,
1764 GLuint precision )
1765 {
1766 struct intel_context *intel = &p->brw->intel;
1767
1768 if (intel->gen >= 6) {
1769 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1770
1771 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1772 (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1773 assert(src.file == BRW_GENERAL_REGISTER_FILE);
1774
1775 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1776 if (intel->gen == 6)
1777 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1778
1779 /* Source modifiers are ignored for extended math instructions on Gen6. */
1780 if (intel->gen == 6) {
1781 assert(!src.negate);
1782 assert(!src.abs);
1783 }
1784
1785 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1786 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1787 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1788 assert(src.type != BRW_REGISTER_TYPE_F);
1789 } else {
1790 assert(src.type == BRW_REGISTER_TYPE_F);
1791 }
1792
1793 /* Math is the same ISA format as other opcodes, except that CondModifier
1794 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1795 */
1796 insn->header.destreg__conditionalmod = function;
1797
1798 brw_set_dest(p, insn, dest);
1799 brw_set_src0(p, insn, src);
1800 brw_set_src1(p, insn, brw_null_reg());
1801 } else {
1802 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1803
1804 /* Example code doesn't set predicate_control for send
1805 * instructions.
1806 */
1807 insn->header.predicate_control = 0;
1808 insn->header.destreg__conditionalmod = msg_reg_nr;
1809
1810 brw_set_dest(p, insn, dest);
1811 brw_set_src0(p, insn, src);
1812 brw_set_math_message(p,
1813 insn,
1814 function,
1815 src.type == BRW_REGISTER_TYPE_D,
1816 precision,
1817 data_type);
1818 }
1819 }
1820
1821 /** Extended math function, float[8].
1822 */
1823 void brw_math2(struct brw_compile *p,
1824 struct brw_reg dest,
1825 GLuint function,
1826 struct brw_reg src0,
1827 struct brw_reg src1)
1828 {
1829 struct intel_context *intel = &p->brw->intel;
1830 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1831
1832 assert(intel->gen >= 6);
1833 (void) intel;
1834
1836 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1837 (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1838 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1839 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1840
1841 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1842 if (intel->gen == 6) {
1843 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1844 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1845 }
1846
1847 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1848 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1849 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1850 assert(src0.type != BRW_REGISTER_TYPE_F);
1851 assert(src1.type != BRW_REGISTER_TYPE_F);
1852 } else {
1853 assert(src0.type == BRW_REGISTER_TYPE_F);
1854 assert(src1.type == BRW_REGISTER_TYPE_F);
1855 }
1856
1857 /* Source modifiers are ignored for extended math instructions on Gen6. */
1858 if (intel->gen == 6) {
1859 assert(!src0.negate);
1860 assert(!src0.abs);
1861 assert(!src1.negate);
1862 assert(!src1.abs);
1863 }
1864
1865 /* Math is the same ISA format as other opcodes, except that CondModifier
1866 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1867 */
1868 insn->header.destreg__conditionalmod = function;
1869
1870 brw_set_dest(p, insn, dest);
1871 brw_set_src0(p, insn, src0);
1872 brw_set_src1(p, insn, src1);
1873 }
1874
1875
1876 /**
1877  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1878  * using a single constant offset carried in the message header.
1879 *
1880 * The offset must be aligned to oword size (16 bytes). Used for
1881 * register spilling.
1882 */
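/* A sketch of the message this helper assembles, assuming the caller hands
 * in a free MRF and has already placed the data to spill in m(mrf.nr + 1)
 * onward: m(mrf.nr) becomes a copy of g0 with the scratch offset written
 * into element 2, followed by one or two registers of data (mlen 2 or 3).
 */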
1883 void brw_oword_block_write_scratch(struct brw_compile *p,
1884 struct brw_reg mrf,
1885 int num_regs,
1886 GLuint offset)
1887 {
1888 struct intel_context *intel = &p->brw->intel;
1889 uint32_t msg_control, msg_type;
1890 int mlen;
1891
1892 if (intel->gen >= 6)
1893 offset /= 16;
1894
1895 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1896
1897 if (num_regs == 1) {
1898 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1899 mlen = 2;
1900 } else {
1901 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1902 mlen = 3;
1903 }
1904
1905 /* Set up the message header. This is g0, with g0.2 filled with
1906 * the offset. We don't want to leave our offset around in g0 or
1907 * it'll screw up texture samples, so set it up inside the message
1908 * reg.
1909 */
1910 {
1911 brw_push_insn_state(p);
1912 brw_set_mask_control(p, BRW_MASK_DISABLE);
1913 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1914
1915 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1916
1917 /* set message header global offset field (reg 0, element 2) */
1918 brw_MOV(p,
1919 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1920 mrf.nr,
1921 2), BRW_REGISTER_TYPE_UD),
1922 brw_imm_ud(offset));
1923
1924 brw_pop_insn_state(p);
1925 }
1926
1927 {
1928 struct brw_reg dest;
1929 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1930 int send_commit_msg;
1931 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1932 BRW_REGISTER_TYPE_UW);
1933
1934 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1935 insn->header.compression_control = BRW_COMPRESSION_NONE;
1936 src_header = vec16(src_header);
1937 }
1938 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1939 insn->header.destreg__conditionalmod = mrf.nr;
1940
1941 /* Until gen6, writes followed by reads from the same location
1942 * are not guaranteed to be ordered unless write_commit is set.
1943 * If set, then a no-op write is issued to the destination
1944 * register to set a dependency, and a read from the destination
1945 * can be used to ensure the ordering.
1946 *
1947 * For gen6, only writes between different threads need ordering
1948 * protection. Our use of DP writes is all about register
1949 * spilling within a thread.
1950 */
1951 if (intel->gen >= 6) {
1952 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1953 send_commit_msg = 0;
1954 } else {
1955 dest = src_header;
1956 send_commit_msg = 1;
1957 }
1958
1959 brw_set_dest(p, insn, dest);
1960 if (intel->gen >= 6) {
1961 brw_set_src0(p, insn, mrf);
1962 } else {
1963 brw_set_src0(p, insn, brw_null_reg());
1964 }
1965
1966 if (intel->gen >= 6)
1967 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1968 else
1969 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1970
1971 brw_set_dp_write_message(p,
1972 insn,
1973 255, /* binding table index (255=stateless) */
1974 msg_control,
1975 msg_type,
1976 mlen,
1977 true, /* header_present */
1978 0, /* not a render target */
1979 send_commit_msg, /* response_length */
1980 0, /* eot */
1981 send_commit_msg);
1982 }
1983 }
1984
1985
1986 /**
1987  * Read a block of OWORDs (half a GRF each) from the scratch buffer,
1988  * using a single constant offset carried in the message header.
1989 *
1990 * Offset must be aligned to oword size (16 bytes). Used for register
1991 * spilling.
1992 */
1993 void
1994 brw_oword_block_read_scratch(struct brw_compile *p,
1995 struct brw_reg dest,
1996 struct brw_reg mrf,
1997 int num_regs,
1998 GLuint offset)
1999 {
2000 struct intel_context *intel = &p->brw->intel;
2001 uint32_t msg_control;
2002 int rlen;
2003
2004 if (intel->gen >= 6)
2005 offset /= 16;
2006
2007 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2008 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2009
2010 if (num_regs == 1) {
2011 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2012 rlen = 1;
2013 } else {
2014 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2015 rlen = 2;
2016 }
2017
2018 {
2019 brw_push_insn_state(p);
2020 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2021 brw_set_mask_control(p, BRW_MASK_DISABLE);
2022
2023 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2024
2025 /* set message header global offset field (reg 0, element 2) */
2026 brw_MOV(p,
2027 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2028 mrf.nr,
2029 2), BRW_REGISTER_TYPE_UD),
2030 brw_imm_ud(offset));
2031
2032 brw_pop_insn_state(p);
2033 }
2034
2035 {
2036 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2037
2038 assert(insn->header.predicate_control == 0);
2039 insn->header.compression_control = BRW_COMPRESSION_NONE;
2040 insn->header.destreg__conditionalmod = mrf.nr;
2041
2042       brw_set_dest(p, insn, dest); /* dest was retyped to UW above */
2043 if (intel->gen >= 6) {
2044 brw_set_src0(p, insn, mrf);
2045 } else {
2046 brw_set_src0(p, insn, brw_null_reg());
2047 }
2048
2049 brw_set_dp_read_message(p,
2050 insn,
2051 255, /* binding table index (255=stateless) */
2052 msg_control,
2053 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2054 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2055 1, /* msg_length */
2056 true, /* header_present */
2057 rlen);
2058 }
2059 }
2060
2061 /**
2062 * Read a float[4] vector from the data port Data Cache (const buffer).
2063 * Location (in buffer) should be a multiple of 16.
2064 * Used for fetching shader constants.
2065 */
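/* A hypothetical call reading one constant vector from binding table slot 3
 * at byte offset 32 into g4, staging the header through m2:
 *
 *    brw_oword_block_read(p, brw_vec8_grf(4, 0), brw_message_reg(2), 32, 3);
 */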
2066 void brw_oword_block_read(struct brw_compile *p,
2067 struct brw_reg dest,
2068 struct brw_reg mrf,
2069 uint32_t offset,
2070 uint32_t bind_table_index)
2071 {
2072 struct intel_context *intel = &p->brw->intel;
2073
2074    /* On gen6+, the offset is specified in OWORDs (16 bytes), not bytes. */
2075 if (intel->gen >= 6)
2076 offset /= 16;
2077
2078 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2079
2080 brw_push_insn_state(p);
2081 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2082 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2083 brw_set_mask_control(p, BRW_MASK_DISABLE);
2084
2085 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2086
2087 /* set message header global offset field (reg 0, element 2) */
2088 brw_MOV(p,
2089 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2090 mrf.nr,
2091 2), BRW_REGISTER_TYPE_UD),
2092 brw_imm_ud(offset));
2093
2094 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2095 insn->header.destreg__conditionalmod = mrf.nr;
2096
2097 /* cast dest to a uword[8] vector */
2098 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2099
2100 brw_set_dest(p, insn, dest);
2101 if (intel->gen >= 6) {
2102 brw_set_src0(p, insn, mrf);
2103 } else {
2104 brw_set_src0(p, insn, brw_null_reg());
2105 }
2106
2107 brw_set_dp_read_message(p,
2108 insn,
2109 bind_table_index,
2110 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2111 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2112 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2113 1, /* msg_length */
2114 true, /* header_present */
2115 1); /* response_length (1 reg, 2 owords!) */
2116
2117 brw_pop_insn_state(p);
2118 }
2119
2120
2121 void brw_fb_WRITE(struct brw_compile *p,
2122 int dispatch_width,
2123 GLuint msg_reg_nr,
2124 struct brw_reg src0,
2125 GLuint msg_control,
2126 GLuint binding_table_index,
2127 GLuint msg_length,
2128 GLuint response_length,
2129 bool eot,
2130 bool header_present)
2131 {
2132 struct intel_context *intel = &p->brw->intel;
2133 struct brw_instruction *insn;
2134 GLuint msg_type;
2135 struct brw_reg dest;
2136
2137 if (dispatch_width == 16)
2138 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2139 else
2140 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2141
2142 if (intel->gen >= 6) {
2143 insn = next_insn(p, BRW_OPCODE_SENDC);
2144 } else {
2145 insn = next_insn(p, BRW_OPCODE_SEND);
2146 }
2147 /* The execution mask is ignored for render target writes. */
2148 insn->header.predicate_control = 0;
2149 insn->header.compression_control = BRW_COMPRESSION_NONE;
2150
2151 if (intel->gen >= 6) {
2152 /* headerless version, just submit color payload */
2153 src0 = brw_message_reg(msg_reg_nr);
2154
2155 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2156 } else {
2157 insn->header.destreg__conditionalmod = msg_reg_nr;
2158
2159 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2160 }
2161
2162 brw_set_dest(p, insn, dest);
2163 brw_set_src0(p, insn, src0);
2164 brw_set_dp_write_message(p,
2165 insn,
2166 binding_table_index,
2167 msg_control,
2168 msg_type,
2169 msg_length,
2170 header_present,
2171 eot, /* last render target write */
2172 response_length,
2173 eot,
2174 0 /* send_commit_msg */);
2175 }
2176
2177
2178 /**
2179 * Texture sample instruction.
2180 * Note: the msg_type plus msg_length values determine exactly what kind
2181 * of sampling operation is performed. See volume 4, page 161 of docs.
2182 */
2183 void brw_SAMPLE(struct brw_compile *p,
2184 struct brw_reg dest,
2185 GLuint msg_reg_nr,
2186 struct brw_reg src0,
2187 GLuint binding_table_index,
2188 GLuint sampler,
2189 GLuint msg_type,
2190 GLuint response_length,
2191 GLuint msg_length,
2192 GLuint header_present,
2193 GLuint simd_mode,
2194 GLuint return_format)
2195 {
2196 struct intel_context *intel = &p->brw->intel;
2197 struct brw_instruction *insn;
2198
2199 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2200
2201 insn = next_insn(p, BRW_OPCODE_SEND);
2202 insn->header.predicate_control = 0; /* XXX */
2203 insn->header.compression_control = BRW_COMPRESSION_NONE;
2204 if (intel->gen < 6)
2205 insn->header.destreg__conditionalmod = msg_reg_nr;
2206
2207 brw_set_dest(p, insn, dest);
2208 brw_set_src0(p, insn, src0);
2209 brw_set_sampler_message(p, insn,
2210 binding_table_index,
2211 sampler,
2212 msg_type,
2213 response_length,
2214 msg_length,
2215 header_present,
2216 simd_mode,
2217 return_format);
2218 }
2219
2220 /* All these variables are pretty confusing - we might be better off
2221 * using bitmasks and macros for this, in the old style. Or perhaps
2222 * just having the caller instantiate the fields in dword3 itself.
2223 */
2224 void brw_urb_WRITE(struct brw_compile *p,
2225 struct brw_reg dest,
2226 GLuint msg_reg_nr,
2227 struct brw_reg src0,
2228 bool allocate,
2229 bool used,
2230 GLuint msg_length,
2231 GLuint response_length,
2232 bool eot,
2233 bool writes_complete,
2234 GLuint offset,
2235 GLuint swizzle)
2236 {
2237 struct intel_context *intel = &p->brw->intel;
2238 struct brw_instruction *insn;
2239
2240 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2241
2242 if (intel->gen == 7) {
2243 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2244 brw_push_insn_state(p);
2245 brw_set_access_mode(p, BRW_ALIGN_1);
2246 brw_set_mask_control(p, BRW_MASK_DISABLE);
2247 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2248 BRW_REGISTER_TYPE_UD),
2249 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2250 brw_imm_ud(0xff00));
2251 brw_pop_insn_state(p);
2252 }
2253
2254 insn = next_insn(p, BRW_OPCODE_SEND);
2255
2256 assert(msg_length < BRW_MAX_MRF);
2257
2258 brw_set_dest(p, insn, dest);
2259 brw_set_src0(p, insn, src0);
2260 brw_set_src1(p, insn, brw_imm_d(0));
2261
2262 if (intel->gen < 6)
2263 insn->header.destreg__conditionalmod = msg_reg_nr;
2264
2265 brw_set_urb_message(p,
2266 insn,
2267 allocate,
2268 used,
2269 msg_length,
2270 response_length,
2271 eot,
2272 writes_complete,
2273 offset,
2274 swizzle);
2275 }
2276
2277 static int
2278 next_ip(struct brw_compile *p, int ip)
2279 {
2280 struct brw_instruction *insn = (void *)p->store + ip;
2281
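   /* Native instructions occupy 16 bytes; compacted ones occupy 8. */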
2282 if (insn->header.cmpt_control)
2283 return ip + 8;
2284 else
2285 return ip + 16;
2286 }
2287
2288 static int
2289 brw_find_next_block_end(struct brw_compile *p, int start)
2290 {
2291 int ip;
2292 void *store = p->store;
2293
2294 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2295 struct brw_instruction *insn = store + ip;
2296
2297 switch (insn->header.opcode) {
2298 case BRW_OPCODE_ENDIF:
2299 case BRW_OPCODE_ELSE:
2300 case BRW_OPCODE_WHILE:
2301 case BRW_OPCODE_HALT:
2302 return ip;
2303 }
2304 }
2305
2306 return 0;
2307 }
2308
2309 /* There is no DO instruction on gen6, so to find the end of the loop
2310 * we have to see if the loop is jumping back before our start
2311 * instruction.
2312 */
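/* For example (hypothetical offsets): with start at 0x40, a WHILE at ip 0x80
 * whose jip is -10 (in units of scale == 8 bytes) branches back to
 * 0x80 + (-10 * 8) = 0x30 <= 0x40, so that WHILE closes our loop.
 */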
2313 static int
2314 brw_find_loop_end(struct brw_compile *p, int start)
2315 {
2316 struct intel_context *intel = &p->brw->intel;
2317 int ip;
2318 int scale = 8;
2319 void *store = p->store;
2320
2321 /* Always start after the instruction (such as a WHILE) we're trying to fix
2322 * up.
2323 */
2324 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2325 struct brw_instruction *insn = store + ip;
2326
2327 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2328 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2329 : insn->bits3.break_cont.jip;
2330 if (ip + jip * scale <= start)
2331 return ip;
2332 }
2333 }
2334 assert(!"not reached");
2335 return start;
2336 }
2337
2338 /* After program generation, go back and update the UIP and JIP of
2339 * BREAK, CONT, and HALT instructions to their correct locations.
2340 */
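/* Note that ip here is a byte offset into p->store, while the encoded JIP
 * and UIP fields are in units of scale == 8 bytes.
 */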
2341 void
2342 brw_set_uip_jip(struct brw_compile *p)
2343 {
2344 struct intel_context *intel = &p->brw->intel;
2345 int ip;
2346 int scale = 8;
2347 void *store = p->store;
2348
2349 if (intel->gen < 6)
2350 return;
2351
2352 for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2353 struct brw_instruction *insn = store + ip;
2354
2355 if (insn->header.cmpt_control) {
2356 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2357 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2358 insn->header.opcode != BRW_OPCODE_CONTINUE &&
2359 insn->header.opcode != BRW_OPCODE_HALT);
2360 continue;
2361 }
2362
2363 int block_end_ip = brw_find_next_block_end(p, ip);
2364 switch (insn->header.opcode) {
2365 case BRW_OPCODE_BREAK:
2366 assert(block_end_ip != 0);
2367 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2368 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2369 insn->bits3.break_cont.uip =
2370 (brw_find_loop_end(p, ip) - ip +
2371 (intel->gen == 6 ? 16 : 0)) / scale;
2372 break;
2373 case BRW_OPCODE_CONTINUE:
2374 assert(block_end_ip != 0);
2375 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2376 insn->bits3.break_cont.uip =
2377 (brw_find_loop_end(p, ip) - ip) / scale;
2378
2379 assert(insn->bits3.break_cont.uip != 0);
2380 assert(insn->bits3.break_cont.jip != 0);
2381 break;
2382
2383 case BRW_OPCODE_ENDIF:
2384 if (block_end_ip == 0)
2385 insn->bits3.break_cont.jip = 2;
2386 else
2387 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2388 break;
2389
2390 case BRW_OPCODE_HALT:
2391 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2392 *
2393 * "In case of the halt instruction not inside any conditional
2394 * code block, the value of <JIP> and <UIP> should be the
2395 * same. In case of the halt instruction inside conditional code
2396 * block, the <UIP> should be the end of the program, and the
2397 * <JIP> should be end of the most inner conditional code block."
2398 *
2399 * The uip will have already been set by whoever set up the
2400 * instruction.
2401 */
2402 if (block_end_ip == 0) {
2403 insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2404 } else {
2405 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2406 }
2407 assert(insn->bits3.break_cont.uip != 0);
2408 assert(insn->bits3.break_cont.jip != 0);
2409 break;
2410 }
2411 }
2412 }
2413
2414 void brw_ff_sync(struct brw_compile *p,
2415 struct brw_reg dest,
2416 GLuint msg_reg_nr,
2417 struct brw_reg src0,
2418 bool allocate,
2419 GLuint response_length,
2420 bool eot)
2421 {
2422 struct intel_context *intel = &p->brw->intel;
2423 struct brw_instruction *insn;
2424
2425 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2426
2427 insn = next_insn(p, BRW_OPCODE_SEND);
2428 brw_set_dest(p, insn, dest);
2429 brw_set_src0(p, insn, src0);
2430 brw_set_src1(p, insn, brw_imm_d(0));
2431
2432 if (intel->gen < 6)
2433 insn->header.destreg__conditionalmod = msg_reg_nr;
2434
2435 brw_set_ff_sync_message(p,
2436 insn,
2437 allocate,
2438 response_length,
2439 eot);
2440 }
2441
2442 /**
2443 * Emit the SEND instruction necessary to generate stream output data on Gen6
2444 * (for transform feedback).
2445 *
2446 * If send_commit_msg is true, this is the last piece of stream output data
2447 * from this thread, so send the data as a committed write. According to the
2448 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2449 *
2450 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2451 * writes are complete by sending the final write as a committed write."
2452 */
2453 void
2454 brw_svb_write(struct brw_compile *p,
2455 struct brw_reg dest,
2456 GLuint msg_reg_nr,
2457 struct brw_reg src0,
2458 GLuint binding_table_index,
2459 bool send_commit_msg)
2460 {
2461 struct brw_instruction *insn;
2462
2463 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2464
2465 insn = next_insn(p, BRW_OPCODE_SEND);
2466 brw_set_dest(p, insn, dest);
2467 brw_set_src0(p, insn, src0);
2468 brw_set_src1(p, insn, brw_imm_d(0));
2469 brw_set_dp_write_message(p, insn,
2470 binding_table_index,
2471 0, /* msg_control: ignored */
2472 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2473 1, /* msg_length */
2474 true, /* header_present */
2475 0, /* last_render_target: ignored */
2476 send_commit_msg, /* response_length */
2477 0, /* end_of_thread */
2478 send_commit_msg); /* send_commit_msg */
2479 }
2480
2481 /**
2482 * This instruction is generated as a single-channel align1 instruction by
2483 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2484 *
2485 * We can't use the typed atomic op in the FS because that has the execution
2486 * mask ANDed with the pixel mask, but we just want to write the one dword for
2487 * all the pixels.
2488 *
2489  * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
2490 * one u32. So we use the same untyped atomic write message as the pixel
2491 * shader.
2492 *
2493 * The untyped atomic operation requires a BUFFER surface type with RAW
2494 * format, and is only accessible through the legacy DATA_CACHE dataport
2495 * messages.
2496 */
2497 void brw_shader_time_add(struct brw_compile *p,
2498 struct brw_reg payload,
2499 uint32_t surf_index)
2500 {
2501 struct intel_context *intel = &p->brw->intel;
2502 assert(intel->gen >= 7);
2503
2504 brw_push_insn_state(p);
2505 brw_set_access_mode(p, BRW_ALIGN_1);
2506 brw_set_mask_control(p, BRW_MASK_DISABLE);
2507 struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2508 brw_pop_insn_state(p);
2509
2510 /* We use brw_vec1_reg and unmasked because we want to increment the given
2511 * offset only once.
2512 */
2513 brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2514 BRW_ARF_NULL, 0));
2515 brw_set_src0(p, send, brw_vec1_reg(payload.file,
2516 payload.nr, 0));
2517
2518 uint32_t sfid, msg_type;
2519 if (intel->is_haswell) {
2520 sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2521 msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2522 } else {
2523 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
2524 msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2525 }
2526
2527 bool header_present = false;
2528 bool eot = false;
2529 uint32_t mlen = 2; /* offset, value */
2530 uint32_t rlen = 0;
2531 brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);
2532
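   /* Pack the function-control half of the descriptor by hand: message type
    * at bit 14, return-data enable at bit 13, the SIMD8-mode flag at bit 12,
    * the atomic operation in bits 11:8, and the surface (binding table)
    * index in bits 7:0.
    */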
2533 send->bits3.ud |= msg_type << 14;
2534 send->bits3.ud |= 0 << 13; /* no return data */
2535 send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2536 send->bits3.ud |= BRW_AOP_ADD << 8;
2537 send->bits3.ud |= surf_index << 0;
2538 }