cce87522f8b74f15a7f7526f39679519d7727808
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 GLuint msg_reg_nr)
65 {
66 struct brw_context *brw = p->brw;
67 if (brw->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88 * "The send with EOT should use register space R112-R127 for <src>. This is
89 * to enable loading of a new thread into the same slot while the message
90 * with EOT for current thread is pending dispatch."
91 *
92 * Since we're pretending to have 16 MRFs anyway, we may as well use the
93 * registers required for messages with EOT.
94 */
95 struct brw_context *brw = p->brw;
96 if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97 reg->file = BRW_GENERAL_REGISTER_FILE;
98 reg->nr += GEN7_MRF_HACK_START;
99 }
100 }
101
102
/* Encode the destination operand of \p insn from \p dest.
 *
 * Handles both direct and register-indirect addressing, and both Align1
 * and Align16 access modes, which use different bitfield layouts.  Also
 * sets the instruction's execution size from the destination width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* GRF/immediate destinations must name one of the 128 registers;
    * ARF and MRF numbers use their own encodings and are exempt.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On Gen7 MRFs are faked on top of GRFs; remap before encoding. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* Horizontal stride 0 is not a valid destination stride;
	  * promote it to 1.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16 subreg numbers are in units of 16 bytes. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 assert(dest.dw1.bits.writemask != 0 ||
		dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
161
162 extern int reg_type_size[];
163
164 static void
165 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
166 {
167 int hstride_for_reg[] = {0, 1, 2, 4};
168 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
169 int width_for_reg[] = {1, 2, 4, 8, 16};
170 int execsize_for_reg[] = {1, 2, 4, 8, 16};
171 int width, hstride, vstride, execsize;
172
173 if (reg.file == BRW_IMMEDIATE_VALUE) {
174 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
175 * mean the destination has to be 128-bit aligned and the
176 * destination horiz stride has to be a word.
177 */
178 if (reg.type == BRW_REGISTER_TYPE_V) {
179 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
180 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
181 }
182
183 return;
184 }
185
186 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
187 reg.file == BRW_ARF_NULL)
188 return;
189
190 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
191 hstride = hstride_for_reg[reg.hstride];
192
193 if (reg.vstride == 0xf) {
194 vstride = -1;
195 } else {
196 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
197 vstride = vstride_for_reg[reg.vstride];
198 }
199
200 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
201 width = width_for_reg[reg.width];
202
203 assert(insn->header.execution_size >= 0 &&
204 insn->header.execution_size < Elements(execsize_for_reg));
205 execsize = execsize_for_reg[insn->header.execution_size];
206
207 /* Restrictions from 3.3.10: Register Region Restrictions. */
208 /* 3. */
209 assert(execsize >= width);
210
211 /* 4. */
212 if (execsize == width && hstride != 0) {
213 assert(vstride == -1 || vstride == width * hstride);
214 }
215
216 /* 5. */
217 if (execsize == width && hstride == 0) {
218 /* no restriction on vstride. */
219 }
220
221 /* 6. */
222 if (width == 1) {
223 assert(hstride == 0);
224 }
225
226 /* 7. */
227 if (execsize == 1 && width == 1) {
228 assert(hstride == 0);
229 assert(vstride == 0);
230 }
231
232 /* 8. */
233 if (vstride == 0 && hstride == 0) {
234 assert(width == 1);
235 }
236
237 /* 10. Check destination issues. */
238 }
239
240 void
241 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
242 struct brw_reg reg)
243 {
244 struct brw_context *brw = p->brw;
245
246 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
247 assert(reg.nr < 128);
248
249 gen7_convert_mrf_to_grf(p, &reg);
250
251 if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
252 insn->header.opcode == BRW_OPCODE_SENDC)) {
253 /* Any source modifiers or regions will be ignored, since this just
254 * identifies the MRF/GRF to start reading the message contents from.
255 * Check for some likely failures.
256 */
257 assert(!reg.negate);
258 assert(!reg.abs);
259 assert(reg.address_mode == BRW_ADDRESS_DIRECT);
260 }
261
262 validate_reg(insn, reg);
263
264 insn->bits1.da1.src0_reg_file = reg.file;
265 insn->bits1.da1.src0_reg_type = reg.type;
266 insn->bits2.da1.src0_abs = reg.abs;
267 insn->bits2.da1.src0_negate = reg.negate;
268 insn->bits2.da1.src0_address_mode = reg.address_mode;
269
270 if (reg.file == BRW_IMMEDIATE_VALUE) {
271 insn->bits3.ud = reg.dw1.ud;
272
273 /* Required to set some fields in src1 as well:
274 */
275 insn->bits1.da1.src1_reg_file = 0; /* arf */
276 insn->bits1.da1.src1_reg_type = reg.type;
277 }
278 else
279 {
280 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
281 if (insn->header.access_mode == BRW_ALIGN_1) {
282 insn->bits2.da1.src0_subreg_nr = reg.subnr;
283 insn->bits2.da1.src0_reg_nr = reg.nr;
284 }
285 else {
286 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
287 insn->bits2.da16.src0_reg_nr = reg.nr;
288 }
289 }
290 else {
291 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
292
293 if (insn->header.access_mode == BRW_ALIGN_1) {
294 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
295 }
296 else {
297 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
298 }
299 }
300
301 if (insn->header.access_mode == BRW_ALIGN_1) {
302 if (reg.width == BRW_WIDTH_1 &&
303 insn->header.execution_size == BRW_EXECUTE_1) {
304 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
305 insn->bits2.da1.src0_width = BRW_WIDTH_1;
306 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
307 }
308 else {
309 insn->bits2.da1.src0_horiz_stride = reg.hstride;
310 insn->bits2.da1.src0_width = reg.width;
311 insn->bits2.da1.src0_vert_stride = reg.vstride;
312 }
313 }
314 else {
315 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
316 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
317 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
318 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
319
320 /* This is an oddity of the fact we're using the same
321 * descriptions for registers in align_16 as align_1:
322 */
323 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
324 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
325 else
326 insn->bits2.da16.src0_vert_stride = reg.vstride;
327 }
328 }
329 }
330
331
332 void brw_set_src1(struct brw_compile *p,
333 struct brw_instruction *insn,
334 struct brw_reg reg)
335 {
336 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
337
338 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
339 assert(reg.nr < 128);
340
341 gen7_convert_mrf_to_grf(p, &reg);
342
343 validate_reg(insn, reg);
344
345 insn->bits1.da1.src1_reg_file = reg.file;
346 insn->bits1.da1.src1_reg_type = reg.type;
347 insn->bits3.da1.src1_abs = reg.abs;
348 insn->bits3.da1.src1_negate = reg.negate;
349
350 /* Only src1 can be immediate in two-argument instructions.
351 */
352 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
353
354 if (reg.file == BRW_IMMEDIATE_VALUE) {
355 insn->bits3.ud = reg.dw1.ud;
356 }
357 else {
358 /* This is a hardware restriction, which may or may not be lifted
359 * in the future:
360 */
361 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
362 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
363
364 if (insn->header.access_mode == BRW_ALIGN_1) {
365 insn->bits3.da1.src1_subreg_nr = reg.subnr;
366 insn->bits3.da1.src1_reg_nr = reg.nr;
367 }
368 else {
369 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
370 insn->bits3.da16.src1_reg_nr = reg.nr;
371 }
372
373 if (insn->header.access_mode == BRW_ALIGN_1) {
374 if (reg.width == BRW_WIDTH_1 &&
375 insn->header.execution_size == BRW_EXECUTE_1) {
376 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
377 insn->bits3.da1.src1_width = BRW_WIDTH_1;
378 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
379 }
380 else {
381 insn->bits3.da1.src1_horiz_stride = reg.hstride;
382 insn->bits3.da1.src1_width = reg.width;
383 insn->bits3.da1.src1_vert_stride = reg.vstride;
384 }
385 }
386 else {
387 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
388 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
389 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
390 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
391
392 /* This is an oddity of the fact we're using the same
393 * descriptions for registers in align_16 as align_1:
394 */
395 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
396 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
397 else
398 insn->bits3.da16.src1_vert_stride = reg.vstride;
399 }
400 }
401 }
402
403 /**
404 * Set the Message Descriptor and Extended Message Descriptor fields
405 * for SEND messages.
406 *
407 * \note This zeroes out the Function Control bits, so it must be called
408 * \b before filling out any message-specific data. Callers can
409 * choose not to fill in irrelevant bits; they will be zero.
410 */
411 static void
412 brw_set_message_descriptor(struct brw_compile *p,
413 struct brw_instruction *inst,
414 enum brw_message_target sfid,
415 unsigned msg_length,
416 unsigned response_length,
417 bool header_present,
418 bool end_of_thread)
419 {
420 struct brw_context *brw = p->brw;
421
422 brw_set_src1(p, inst, brw_imm_d(0));
423
424 if (brw->gen >= 5) {
425 inst->bits3.generic_gen5.header_present = header_present;
426 inst->bits3.generic_gen5.response_length = response_length;
427 inst->bits3.generic_gen5.msg_length = msg_length;
428 inst->bits3.generic_gen5.end_of_thread = end_of_thread;
429
430 if (brw->gen >= 6) {
431 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
432 inst->header.destreg__conditionalmod = sfid;
433 } else {
434 /* Set Extended Message Descriptor (ex_desc) */
435 inst->bits2.send_gen5.sfid = sfid;
436 inst->bits2.send_gen5.end_of_thread = end_of_thread;
437 }
438 } else {
439 inst->bits3.generic.response_length = response_length;
440 inst->bits3.generic.msg_length = msg_length;
441 inst->bits3.generic.msg_target = sfid;
442 inst->bits3.generic.end_of_thread = end_of_thread;
443 }
444 }
445
446 static void brw_set_math_message( struct brw_compile *p,
447 struct brw_instruction *insn,
448 GLuint function,
449 GLuint integer_type,
450 bool low_precision,
451 GLuint dataType )
452 {
453 struct brw_context *brw = p->brw;
454 unsigned msg_length;
455 unsigned response_length;
456
457 /* Infer message length from the function */
458 switch (function) {
459 case BRW_MATH_FUNCTION_POW:
460 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
461 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
462 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
463 msg_length = 2;
464 break;
465 default:
466 msg_length = 1;
467 break;
468 }
469
470 /* Infer response length from the function */
471 switch (function) {
472 case BRW_MATH_FUNCTION_SINCOS:
473 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
474 response_length = 2;
475 break;
476 default:
477 response_length = 1;
478 break;
479 }
480
481
482 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
483 msg_length, response_length, false, false);
484 if (brw->gen == 5) {
485 insn->bits3.math_gen5.function = function;
486 insn->bits3.math_gen5.int_type = integer_type;
487 insn->bits3.math_gen5.precision = low_precision;
488 insn->bits3.math_gen5.saturate = insn->header.saturate;
489 insn->bits3.math_gen5.data_type = dataType;
490 insn->bits3.math_gen5.snapshot = 0;
491 } else {
492 insn->bits3.math.function = function;
493 insn->bits3.math.int_type = integer_type;
494 insn->bits3.math.precision = low_precision;
495 insn->bits3.math.saturate = insn->header.saturate;
496 insn->bits3.math.data_type = dataType;
497 }
498 insn->header.saturate = 0;
499 }
500
501
502 static void brw_set_ff_sync_message(struct brw_compile *p,
503 struct brw_instruction *insn,
504 bool allocate,
505 GLuint response_length,
506 bool end_of_thread)
507 {
508 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
509 1, response_length, true, end_of_thread);
510 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
511 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
512 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
513 insn->bits3.urb_gen5.allocate = allocate;
514 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
515 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
516 }
517
518 static void brw_set_urb_message( struct brw_compile *p,
519 struct brw_instruction *insn,
520 enum brw_urb_write_flags flags,
521 GLuint msg_length,
522 GLuint response_length,
523 GLuint offset,
524 GLuint swizzle_control )
525 {
526 struct brw_context *brw = p->brw;
527
528 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
529 msg_length, response_length, true,
530 flags & BRW_URB_WRITE_EOT);
531 if (brw->gen == 7) {
532 if (flags & BRW_URB_WRITE_OWORD) {
533 assert(msg_length == 2); /* header + one OWORD of data */
534 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
535 } else {
536 insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
537 }
538 insn->bits3.urb_gen7.offset = offset;
539 assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
540 insn->bits3.urb_gen7.swizzle_control = swizzle_control;
541 insn->bits3.urb_gen7.per_slot_offset =
542 flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
543 insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
544 } else if (brw->gen >= 5) {
545 insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
546 insn->bits3.urb_gen5.offset = offset;
547 insn->bits3.urb_gen5.swizzle_control = swizzle_control;
548 insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
549 insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
550 insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
551 } else {
552 insn->bits3.urb.opcode = 0; /* ? */
553 insn->bits3.urb.offset = offset;
554 insn->bits3.urb.swizzle_control = swizzle_control;
555 insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
556 insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
557 insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
558 }
559 }
560
561 void
562 brw_set_dp_write_message(struct brw_compile *p,
563 struct brw_instruction *insn,
564 GLuint binding_table_index,
565 GLuint msg_control,
566 GLuint msg_type,
567 GLuint msg_length,
568 bool header_present,
569 GLuint last_render_target,
570 GLuint response_length,
571 GLuint end_of_thread,
572 GLuint send_commit_msg)
573 {
574 struct brw_context *brw = p->brw;
575 unsigned sfid;
576
577 if (brw->gen >= 7) {
578 /* Use the Render Cache for RT writes; otherwise use the Data Cache */
579 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
580 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
581 else
582 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
583 } else if (brw->gen == 6) {
584 /* Use the render cache for all write messages. */
585 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
586 } else {
587 sfid = BRW_SFID_DATAPORT_WRITE;
588 }
589
590 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
591 header_present, end_of_thread);
592
593 if (brw->gen >= 7) {
594 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
595 insn->bits3.gen7_dp.msg_control = msg_control;
596 insn->bits3.gen7_dp.last_render_target = last_render_target;
597 insn->bits3.gen7_dp.msg_type = msg_type;
598 } else if (brw->gen == 6) {
599 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
600 insn->bits3.gen6_dp.msg_control = msg_control;
601 insn->bits3.gen6_dp.last_render_target = last_render_target;
602 insn->bits3.gen6_dp.msg_type = msg_type;
603 insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
604 } else if (brw->gen == 5) {
605 insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
606 insn->bits3.dp_write_gen5.msg_control = msg_control;
607 insn->bits3.dp_write_gen5.last_render_target = last_render_target;
608 insn->bits3.dp_write_gen5.msg_type = msg_type;
609 insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
610 } else {
611 insn->bits3.dp_write.binding_table_index = binding_table_index;
612 insn->bits3.dp_write.msg_control = msg_control;
613 insn->bits3.dp_write.last_render_target = last_render_target;
614 insn->bits3.dp_write.msg_type = msg_type;
615 insn->bits3.dp_write.send_commit_msg = send_commit_msg;
616 }
617 }
618
619 void
620 brw_set_dp_read_message(struct brw_compile *p,
621 struct brw_instruction *insn,
622 GLuint binding_table_index,
623 GLuint msg_control,
624 GLuint msg_type,
625 GLuint target_cache,
626 GLuint msg_length,
627 bool header_present,
628 GLuint response_length)
629 {
630 struct brw_context *brw = p->brw;
631 unsigned sfid;
632
633 if (brw->gen >= 7) {
634 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
635 } else if (brw->gen == 6) {
636 if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
637 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
638 else
639 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
640 } else {
641 sfid = BRW_SFID_DATAPORT_READ;
642 }
643
644 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
645 header_present, false);
646
647 if (brw->gen >= 7) {
648 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
649 insn->bits3.gen7_dp.msg_control = msg_control;
650 insn->bits3.gen7_dp.last_render_target = 0;
651 insn->bits3.gen7_dp.msg_type = msg_type;
652 } else if (brw->gen == 6) {
653 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
654 insn->bits3.gen6_dp.msg_control = msg_control;
655 insn->bits3.gen6_dp.last_render_target = 0;
656 insn->bits3.gen6_dp.msg_type = msg_type;
657 insn->bits3.gen6_dp.send_commit_msg = 0;
658 } else if (brw->gen == 5) {
659 insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
660 insn->bits3.dp_read_gen5.msg_control = msg_control;
661 insn->bits3.dp_read_gen5.msg_type = msg_type;
662 insn->bits3.dp_read_gen5.target_cache = target_cache;
663 } else if (brw->is_g4x) {
664 insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
665 insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
666 insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
667 insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/
668 } else {
669 insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
670 insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
671 insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
672 insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
673 }
674 }
675
676 void
677 brw_set_sampler_message(struct brw_compile *p,
678 struct brw_instruction *insn,
679 GLuint binding_table_index,
680 GLuint sampler,
681 GLuint msg_type,
682 GLuint response_length,
683 GLuint msg_length,
684 GLuint header_present,
685 GLuint simd_mode,
686 GLuint return_format)
687 {
688 struct brw_context *brw = p->brw;
689
690 brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
691 response_length, header_present, false);
692
693 if (brw->gen >= 7) {
694 insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
695 insn->bits3.sampler_gen7.sampler = sampler;
696 insn->bits3.sampler_gen7.msg_type = msg_type;
697 insn->bits3.sampler_gen7.simd_mode = simd_mode;
698 } else if (brw->gen >= 5) {
699 insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
700 insn->bits3.sampler_gen5.sampler = sampler;
701 insn->bits3.sampler_gen5.msg_type = msg_type;
702 insn->bits3.sampler_gen5.simd_mode = simd_mode;
703 } else if (brw->is_g4x) {
704 insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
705 insn->bits3.sampler_g4x.sampler = sampler;
706 insn->bits3.sampler_g4x.msg_type = msg_type;
707 } else {
708 insn->bits3.sampler.binding_table_index = binding_table_index;
709 insn->bits3.sampler.sampler = sampler;
710 insn->bits3.sampler.msg_type = msg_type;
711 insn->bits3.sampler.return_format = return_format;
712 }
713 }
714
715
716 #define next_insn brw_next_insn
717 struct brw_instruction *
718 brw_next_insn(struct brw_compile *p, GLuint opcode)
719 {
720 struct brw_instruction *insn;
721
722 if (p->nr_insn + 1 > p->store_size) {
723 if (0)
724 printf("incresing the store size to %d\n", p->store_size << 1);
725 p->store_size <<= 1;
726 p->store = reralloc(p->mem_ctx, p->store,
727 struct brw_instruction, p->store_size);
728 if (!p->store)
729 assert(!"realloc eu store memeory failed");
730 }
731
732 p->next_insn_offset += 16;
733 insn = &p->store[p->nr_insn++];
734 memcpy(insn, p->current, sizeof(*insn));
735
736 /* Reset this one-shot flag:
737 */
738
739 if (p->current->header.destreg__conditionalmod) {
740 p->current->header.destreg__conditionalmod = 0;
741 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
742 }
743
744 insn->header.opcode = opcode;
745 return insn;
746 }
747
748 static struct brw_instruction *brw_alu1( struct brw_compile *p,
749 GLuint opcode,
750 struct brw_reg dest,
751 struct brw_reg src )
752 {
753 struct brw_instruction *insn = next_insn(p, opcode);
754 brw_set_dest(p, insn, dest);
755 brw_set_src0(p, insn, src);
756 return insn;
757 }
758
759 static struct brw_instruction *brw_alu2(struct brw_compile *p,
760 GLuint opcode,
761 struct brw_reg dest,
762 struct brw_reg src0,
763 struct brw_reg src1 )
764 {
765 struct brw_instruction *insn = next_insn(p, opcode);
766 brw_set_dest(p, insn, dest);
767 brw_set_src0(p, insn, src0);
768 brw_set_src1(p, insn, src1);
769 return insn;
770 }
771
772 static int
773 get_3src_subreg_nr(struct brw_reg reg)
774 {
775 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
776 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
777 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
778 } else {
779 return reg.subnr / 4;
780 }
781 }
782
/* Emit a three-source ALU instruction (MAD/LRP/BFE/BFI2).
 *
 * 3-src instructions use a distinct, more restrictive encoding: Align16
 * only, direct addressing only, GRF sources (MRF allowed for dest), and
 * a single shared source type field derived here from dest.type.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					GLuint opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   /* dest_reg_file is a single bit: 0 = GRF, 1 = MRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   /* Subreg number is in 16-byte units in Align16 mode. */
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl = 1 replicates a scalar source across all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   /* src1's subreg number is split across the bits2/bits3 dword boundary. */
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
866
867
868 /***********************************************************************
869 * Convenience routines.
870 */
/* Generate a public emitter brw_<OP> for a one-source instruction. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Generate a public emitter brw_<OP> for a two-source instruction. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Generate a public emitter brw_<OP> for a three-source instruction. */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Like ALU3, but asserts that all four operands are float-typed. */
#define ALU3F(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   assert(dest.type == BRW_REGISTER_TYPE_F);		\
   assert(src0.type == BRW_REGISTER_TYPE_F);		\
   assert(src1.type == BRW_REGISTER_TYPE_F);		\
   assert(src2.type == BRW_REGISTER_TYPE_F);		\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}


/* Instantiate the public emitters declared in brw_eu.h. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)

ROUND(RNDZ)
ROUND(RNDE)
972
973
974 struct brw_instruction *brw_ADD(struct brw_compile *p,
975 struct brw_reg dest,
976 struct brw_reg src0,
977 struct brw_reg src1)
978 {
979 /* 6.2.2: add */
980 if (src0.type == BRW_REGISTER_TYPE_F ||
981 (src0.file == BRW_IMMEDIATE_VALUE &&
982 src0.type == BRW_REGISTER_TYPE_VF)) {
983 assert(src1.type != BRW_REGISTER_TYPE_UD);
984 assert(src1.type != BRW_REGISTER_TYPE_D);
985 }
986
987 if (src1.type == BRW_REGISTER_TYPE_F ||
988 (src1.file == BRW_IMMEDIATE_VALUE &&
989 src1.type == BRW_REGISTER_TYPE_VF)) {
990 assert(src0.type != BRW_REGISTER_TYPE_UD);
991 assert(src0.type != BRW_REGISTER_TYPE_D);
992 }
993
994 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
995 }
996
997 struct brw_instruction *brw_AVG(struct brw_compile *p,
998 struct brw_reg dest,
999 struct brw_reg src0,
1000 struct brw_reg src1)
1001 {
1002 assert(dest.type == src0.type);
1003 assert(src0.type == src1.type);
1004 switch (src0.type) {
1005 case BRW_REGISTER_TYPE_B:
1006 case BRW_REGISTER_TYPE_UB:
1007 case BRW_REGISTER_TYPE_W:
1008 case BRW_REGISTER_TYPE_UW:
1009 case BRW_REGISTER_TYPE_D:
1010 case BRW_REGISTER_TYPE_UD:
1011 break;
1012 default:
1013 assert(!"Bad type for brw_AVG");
1014 }
1015
1016 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1017 }
1018
1019 struct brw_instruction *brw_MUL(struct brw_compile *p,
1020 struct brw_reg dest,
1021 struct brw_reg src0,
1022 struct brw_reg src1)
1023 {
1024 /* 6.32.38: mul */
1025 if (src0.type == BRW_REGISTER_TYPE_D ||
1026 src0.type == BRW_REGISTER_TYPE_UD ||
1027 src1.type == BRW_REGISTER_TYPE_D ||
1028 src1.type == BRW_REGISTER_TYPE_UD) {
1029 assert(dest.type != BRW_REGISTER_TYPE_F);
1030 }
1031
1032 if (src0.type == BRW_REGISTER_TYPE_F ||
1033 (src0.file == BRW_IMMEDIATE_VALUE &&
1034 src0.type == BRW_REGISTER_TYPE_VF)) {
1035 assert(src1.type != BRW_REGISTER_TYPE_UD);
1036 assert(src1.type != BRW_REGISTER_TYPE_D);
1037 }
1038
1039 if (src1.type == BRW_REGISTER_TYPE_F ||
1040 (src1.file == BRW_IMMEDIATE_VALUE &&
1041 src1.type == BRW_REGISTER_TYPE_VF)) {
1042 assert(src0.type != BRW_REGISTER_TYPE_UD);
1043 assert(src0.type != BRW_REGISTER_TYPE_D);
1044 }
1045
1046 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1047 src0.nr != BRW_ARF_ACCUMULATOR);
1048 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1049 src1.nr != BRW_ARF_ACCUMULATOR);
1050
1051 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1052 }
1053
1054
1055 void brw_NOP(struct brw_compile *p)
1056 {
1057 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1058 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1059 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1060 brw_set_src1(p, insn, brw_imm_ud(0x0));
1061 }
1062
1063
1064
1065
1066
1067 /***********************************************************************
1068 * Comparisons, if/else/endif
1069 */
1070
1071 struct brw_instruction *brw_JMPI(struct brw_compile *p,
1072 struct brw_reg dest,
1073 struct brw_reg src0,
1074 struct brw_reg src1)
1075 {
1076 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1077
1078 insn->header.execution_size = 1;
1079 insn->header.compression_control = BRW_COMPRESSION_NONE;
1080 insn->header.mask_control = BRW_MASK_DISABLE;
1081
1082 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1083
1084 return insn;
1085 }
1086
1087 static void
1088 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1089 {
1090 p->if_stack[p->if_stack_depth] = inst - p->store;
1091
1092 p->if_stack_depth++;
1093 if (p->if_stack_array_size <= p->if_stack_depth) {
1094 p->if_stack_array_size *= 2;
1095 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1096 p->if_stack_array_size);
1097 }
1098 }
1099
1100 static struct brw_instruction *
1101 pop_if_stack(struct brw_compile *p)
1102 {
1103 p->if_stack_depth--;
1104 return &p->store[p->if_stack[p->if_stack_depth]];
1105 }
1106
1107 static void
1108 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1109 {
1110 if (p->loop_stack_array_size < p->loop_stack_depth) {
1111 p->loop_stack_array_size *= 2;
1112 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1113 p->loop_stack_array_size);
1114 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1115 p->loop_stack_array_size);
1116 }
1117
1118 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1119 p->loop_stack_depth++;
1120 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1121 }
1122
1123 static struct brw_instruction *
1124 get_inner_do_insn(struct brw_compile *p)
1125 {
1126 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1127 }
1128
/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack). Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off. If the stack is now empty, normal execution resumes.
 *
 * Jump targets (jump_count / JIP / UIP on gen6+) are left zero here and
 * filled in later by patch_IF_ELSE() once the ENDIF location is known.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Pre-gen6 IF operates on the instruction pointer. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6 encodes the jump count in dest; placeholder of 0 for now. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7 uses JIP/UIP fields; placeholders of 0 for now. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* The IF consumed the predicate; clear it for following instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1182
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 *
 * The comparison (src0 <conditional> src1) is folded into the IF itself,
 * so no separate CMP is needed.  The jump count is left zero and patched
 * later by patch_IF_ELSE().
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1214
1215 /**
1216 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1217 */
1218 static void
1219 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1220 struct brw_instruction *if_inst,
1221 struct brw_instruction *else_inst)
1222 {
1223 /* The next instruction (where the ENDIF would be, if it existed) */
1224 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1225
1226 assert(p->single_program_flow);
1227 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1228 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1229 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1230
1231 /* Convert IF to an ADD instruction that moves the instruction pointer
1232 * to the first instruction of the ELSE block. If there is no ELSE
1233 * block, point to where ENDIF would be. Reverse the predicate.
1234 *
1235 * There's no need to execute an ENDIF since we don't need to do any
1236 * stack operations, and if we're currently executing, we just want to
1237 * continue normally.
1238 */
1239 if_inst->header.opcode = BRW_OPCODE_ADD;
1240 if_inst->header.predicate_inverse = 1;
1241
1242 if (else_inst != NULL) {
1243 /* Convert ELSE to an ADD instruction that points where the ENDIF
1244 * would be.
1245 */
1246 else_inst->header.opcode = BRW_OPCODE_ADD;
1247
1248 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1249 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1250 } else {
1251 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1252 }
1253 }
1254
1255 /**
1256 * Patch IF and ELSE instructions with appropriate jump targets.
1257 */
1258 static void
1259 patch_IF_ELSE(struct brw_compile *p,
1260 struct brw_instruction *if_inst,
1261 struct brw_instruction *else_inst,
1262 struct brw_instruction *endif_inst)
1263 {
1264 struct brw_context *brw = p->brw;
1265
1266 /* We shouldn't be patching IF and ELSE instructions in single program flow
1267 * mode when gen < 6, because in single program flow mode on those
1268 * platforms, we convert flow control instructions to conditional ADDs that
1269 * operate on IP (see brw_ENDIF).
1270 *
1271 * However, on Gen6, writing to IP doesn't work in single program flow mode
1272 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1273 * not be updated by non-flow control instructions."). And on later
1274 * platforms, there is no significant benefit to converting control flow
1275 * instructions to conditional ADDs. So we do patch IF and ELSE
1276 * instructions in single program flow mode on those platforms.
1277 */
1278 if (brw->gen < 6)
1279 assert(!p->single_program_flow);
1280
1281 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1282 assert(endif_inst != NULL);
1283 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1284
1285 unsigned br = 1;
1286 /* Jump count is for 64bit data chunk each, so one 128bit instruction
1287 * requires 2 chunks.
1288 */
1289 if (brw->gen >= 5)
1290 br = 2;
1291
1292 assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1293 endif_inst->header.execution_size = if_inst->header.execution_size;
1294
1295 if (else_inst == NULL) {
1296 /* Patch IF -> ENDIF */
1297 if (brw->gen < 6) {
1298 /* Turn it into an IFF, which means no mask stack operations for
1299 * all-false and jumping past the ENDIF.
1300 */
1301 if_inst->header.opcode = BRW_OPCODE_IFF;
1302 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1303 if_inst->bits3.if_else.pop_count = 0;
1304 if_inst->bits3.if_else.pad0 = 0;
1305 } else if (brw->gen == 6) {
1306 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1307 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1308 } else {
1309 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1310 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1311 }
1312 } else {
1313 else_inst->header.execution_size = if_inst->header.execution_size;
1314
1315 /* Patch IF -> ELSE */
1316 if (brw->gen < 6) {
1317 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1318 if_inst->bits3.if_else.pop_count = 0;
1319 if_inst->bits3.if_else.pad0 = 0;
1320 } else if (brw->gen == 6) {
1321 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1322 }
1323
1324 /* Patch ELSE -> ENDIF */
1325 if (brw->gen < 6) {
1326 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1327 * matching ENDIF.
1328 */
1329 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1330 else_inst->bits3.if_else.pop_count = 1;
1331 else_inst->bits3.if_else.pad0 = 0;
1332 } else if (brw->gen == 6) {
1333 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1334 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1335 } else {
1336 /* The IF instruction's JIP should point just past the ELSE */
1337 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1338 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1339 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1340 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1341 }
1342 }
1343 }
1344
/* Emit an ELSE instruction for the innermost open IF.
 *
 * Jump targets are left zero here and filled in by patch_IF_ELSE() from
 * brw_ENDIF().  The ELSE is pushed onto the if-stack so brw_ENDIF() can
 * find it.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      /* Pre-gen6 ELSE operates on the instruction pointer. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6 encodes the jump count in dest; placeholder for now. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7 uses JIP/UIP fields; placeholders of 0 for now. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1377
/* Close the innermost IF/ELSE block.
 *
 * Pops the IF (and optional ELSE) off the if-stack, emits an ENDIF where
 * one is needed, and patches the jump targets of the whole construct via
 * patch_IF_ELSE().  On gen4/5 in SPF mode no ENDIF is emitted at all; the
 * IF/ELSE are rewritten into predicated ADDs on IP instead.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store memory (p->store), so call it first, before converting the
    * stacked indices back into instruction pointers.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1456
/* Emit a BREAK out of the innermost loop.
 *
 * On gen6+ the jump offsets are filled in later (brw_set_uip_jip /
 * brw_patch_break_cont).  Pre-gen6, the BREAK also pops the mask-stack
 * entries pushed by IFs nested inside the current loop.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      /* Pop one mask-stack entry per IF open inside the current loop. */
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1479
1480 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1481 {
1482 struct brw_instruction *insn;
1483
1484 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1485 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1486 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1487 brw_set_dest(p, insn, brw_ip_reg());
1488 brw_set_src0(p, insn, brw_ip_reg());
1489 brw_set_src1(p, insn, brw_imm_d(0x0));
1490
1491 insn->header.compression_control = BRW_COMPRESSION_NONE;
1492 insn->header.execution_size = BRW_EXECUTE_8;
1493 return insn;
1494 }
1495
/* Emit a CONTINUE for the innermost loop (pre-gen6 variant).
 *
 * The jump count is left zero and patched by brw_patch_break_cont() when
 * the matching WHILE is emitted.  Like BREAK, it pops the mask-stack
 * entries of IFs nested inside the current loop.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   /* Pop one mask-stack entry per IF open inside the current loop. */
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1510
/* Emit a HALT instruction (gen6+).
 *
 * UIP and JIP live in src1 and are patched later by the caller (the
 * immediate 0 here is just a placeholder).
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1528
/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 *
 * Returns the first instruction of the loop body (on gen6+ / SPF no
 * actual DO is emitted -- the returned pointer is simply where the body
 * will start).  In all cases the loop start is recorded on the loop
 * stack for brw_WHILE()/brw_patch_break_cont().
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      /* No DO instruction needed; just remember where the loop starts. */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1572
1573 /**
1574 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1575 * instruction here.
1576 *
1577 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1578 * nesting, since it can always just point to the end of the block/current loop.
1579 */
1580 static void
1581 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1582 {
1583 struct brw_context *brw = p->brw;
1584 struct brw_instruction *do_inst = get_inner_do_insn(p);
1585 struct brw_instruction *inst;
1586 int br = (brw->gen == 5) ? 2 : 1;
1587
1588 for (inst = while_inst - 1; inst != do_inst; inst--) {
1589 /* If the jump count is != 0, that means that this instruction has already
1590 * been patched because it's part of a loop inside of the one we're
1591 * patching.
1592 */
1593 if (inst->header.opcode == BRW_OPCODE_BREAK &&
1594 inst->bits3.if_else.jump_count == 0) {
1595 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1596 } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1597 inst->bits3.if_else.jump_count == 0) {
1598 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1599 }
1600 }
1601 }
1602
/* Emit the WHILE that closes the innermost brw_DO() loop.
 *
 * Per generation: gen7 uses a negative JIP back to the loop start; gen6
 * uses a negative jump count in dest; pre-gen6 either emits an ADD on IP
 * (single-program-flow) or a real WHILE pointing just past the DO, and
 * then back-patches any BREAK/CONT in the body.  Pops the loop stack.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   /* Jump offsets count 64-bit chunks from gen5 on: 2 per instruction. */
   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* Backwards jump: do_insn precedes insn, so JIP is negative. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF: a scalar backwards ADD on IP replaces the WHILE. */
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 /* Jump to the instruction following the DO. */
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1666
1667
1668 /* FORWARD JUMPS:
1669 */
1670 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1671 {
1672 struct brw_context *brw = p->brw;
1673 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1674 GLuint jmpi = 1;
1675
1676 if (brw->gen >= 5)
1677 jmpi = 2;
1678
1679 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1680 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1681
1682 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1683 }
1684
1685
1686
/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
 *
 * Emits CMP with the given conditional modifier.  If dest is the null
 * register, only the flags are written, and subsequent instructions
 * default to normal predication until the caller changes it.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1732
/* Issue 'wait' instruction for n1, host could program MMIO
   to wake up thread.
 *
 * dest and src0 are both the notification register n1; execution size,
 * predication and compression must all be zero for WAIT.
 */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1747
1748
1749 /***********************************************************************
1750 * Helpers for the various SEND message types:
1751 */
1752
/** Extended math function, float[8].
 *
 * On gen6+ this is a native MATH opcode (the conditional-modifier field
 * encodes the function, FC[3:0]); msg_reg_nr, data_type and precision are
 * ignored there.  Pre-gen6, it is a SEND to the shared math unit using
 * message register msg_reg_nr.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* INT DIV variants take integer sources; everything else is float. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1817
/** Extended math function, float[8].
 *
 * Two-source variant of brw_math() for gen6+ only (POW and the INT DIV
 * functions take two sources).  The function number goes in the
 * conditional-modifier field, as in brw_math().
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* INT DIV variants take integer sources; everything else is float. */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1867
1868
/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 *
 * \param mrf      message register used to build the header (retyped to UD)
 * \param num_regs number of GRFs to write (1 or 2)
 * \param offset   byte offset into the scratch buffer (oword-aligned)
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   GLuint offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control, msg_type;
   int mlen;

   /* On gen6+ the header's offset field is in owords, not bytes. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* mlen counts the header register plus the payload GRFs: one GRF is
    * 2 owords, two GRFs are 4 owords.
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      /* The SEND is forced uncompressed; the header source is widened to
       * vec16 in that case — presumably to match the execution size the
       * surrounding code was using.  NOTE(review): confirm against the
       * compressed-instruction region rules.
       */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
         insn->header.compression_control = BRW_COMPRESSION_NONE;
         src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      /* For SEND this field carries the message register number. */
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (brw->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (brw->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (brw->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
                               msg_control,
                               msg_type,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}
1977
1978
1979 /**
1980 * Read a block of owords (half a GRF each) from the scratch buffer
1981 * using a constant index per channel.
1982 *
1983 * Offset must be aligned to oword size (16 bytes). Used for register
1984 * spilling.
1985 */
1986 void
1987 brw_oword_block_read_scratch(struct brw_compile *p,
1988 struct brw_reg dest,
1989 struct brw_reg mrf,
1990 int num_regs,
1991 GLuint offset)
1992 {
1993 struct brw_context *brw = p->brw;
1994 uint32_t msg_control;
1995 int rlen;
1996
1997 if (brw->gen >= 6)
1998 offset /= 16;
1999
2000 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2001 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2002
2003 if (num_regs == 1) {
2004 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2005 rlen = 1;
2006 } else {
2007 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2008 rlen = 2;
2009 }
2010
2011 {
2012 brw_push_insn_state(p);
2013 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2014 brw_set_mask_control(p, BRW_MASK_DISABLE);
2015
2016 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2017
2018 /* set message header global offset field (reg 0, element 2) */
2019 brw_MOV(p,
2020 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2021 mrf.nr,
2022 2), BRW_REGISTER_TYPE_UD),
2023 brw_imm_ud(offset));
2024
2025 brw_pop_insn_state(p);
2026 }
2027
2028 {
2029 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2030
2031 assert(insn->header.predicate_control == 0);
2032 insn->header.compression_control = BRW_COMPRESSION_NONE;
2033 insn->header.destreg__conditionalmod = mrf.nr;
2034
2035 brw_set_dest(p, insn, dest); /* UW? */
2036 if (brw->gen >= 6) {
2037 brw_set_src0(p, insn, mrf);
2038 } else {
2039 brw_set_src0(p, insn, brw_null_reg());
2040 }
2041
2042 brw_set_dp_read_message(p,
2043 insn,
2044 255, /* binding table index (255=stateless) */
2045 msg_control,
2046 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2047 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2048 1, /* msg_length */
2049 true, /* header_present */
2050 rlen);
2051 }
2052 }
2053
2054 /**
2055 * Read a float[4] vector from the data port Data Cache (const buffer).
2056 * Location (in buffer) should be a multiple of 16.
2057 * Used for fetching shader constants.
2058 */
2059 void brw_oword_block_read(struct brw_compile *p,
2060 struct brw_reg dest,
2061 struct brw_reg mrf,
2062 uint32_t offset,
2063 uint32_t bind_table_index)
2064 {
2065 struct brw_context *brw = p->brw;
2066
2067 /* On newer hardware, offset is in units of owords. */
2068 if (brw->gen >= 6)
2069 offset /= 16;
2070
2071 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2072
2073 brw_push_insn_state(p);
2074 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2075 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2076 brw_set_mask_control(p, BRW_MASK_DISABLE);
2077
2078 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2079
2080 /* set message header global offset field (reg 0, element 2) */
2081 brw_MOV(p,
2082 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2083 mrf.nr,
2084 2), BRW_REGISTER_TYPE_UD),
2085 brw_imm_ud(offset));
2086
2087 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2088 insn->header.destreg__conditionalmod = mrf.nr;
2089
2090 /* cast dest to a uword[8] vector */
2091 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2092
2093 brw_set_dest(p, insn, dest);
2094 if (brw->gen >= 6) {
2095 brw_set_src0(p, insn, mrf);
2096 } else {
2097 brw_set_src0(p, insn, brw_null_reg());
2098 }
2099
2100 brw_set_dp_read_message(p,
2101 insn,
2102 bind_table_index,
2103 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2104 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2105 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2106 1, /* msg_length */
2107 true, /* header_present */
2108 1); /* response_length (1 reg, 2 owords!) */
2109
2110 brw_pop_insn_state(p);
2111 }
2112
2113
2114 void brw_fb_WRITE(struct brw_compile *p,
2115 int dispatch_width,
2116 GLuint msg_reg_nr,
2117 struct brw_reg src0,
2118 GLuint msg_control,
2119 GLuint binding_table_index,
2120 GLuint msg_length,
2121 GLuint response_length,
2122 bool eot,
2123 bool header_present)
2124 {
2125 struct brw_context *brw = p->brw;
2126 struct brw_instruction *insn;
2127 GLuint msg_type;
2128 struct brw_reg dest;
2129
2130 if (dispatch_width == 16)
2131 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2132 else
2133 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2134
2135 if (brw->gen >= 6) {
2136 insn = next_insn(p, BRW_OPCODE_SENDC);
2137 } else {
2138 insn = next_insn(p, BRW_OPCODE_SEND);
2139 }
2140 /* The execution mask is ignored for render target writes. */
2141 insn->header.predicate_control = 0;
2142 insn->header.compression_control = BRW_COMPRESSION_NONE;
2143
2144 if (brw->gen >= 6) {
2145 /* headerless version, just submit color payload */
2146 src0 = brw_message_reg(msg_reg_nr);
2147
2148 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2149 } else {
2150 insn->header.destreg__conditionalmod = msg_reg_nr;
2151
2152 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2153 }
2154
2155 brw_set_dest(p, insn, dest);
2156 brw_set_src0(p, insn, src0);
2157 brw_set_dp_write_message(p,
2158 insn,
2159 binding_table_index,
2160 msg_control,
2161 msg_type,
2162 msg_length,
2163 header_present,
2164 eot, /* last render target write */
2165 response_length,
2166 eot,
2167 0 /* send_commit_msg */);
2168 }
2169
2170
2171 /**
2172 * Texture sample instruction.
2173 * Note: the msg_type plus msg_length values determine exactly what kind
2174 * of sampling operation is performed. See volume 4, page 161 of docs.
2175 */
2176 void brw_SAMPLE(struct brw_compile *p,
2177 struct brw_reg dest,
2178 GLuint msg_reg_nr,
2179 struct brw_reg src0,
2180 GLuint binding_table_index,
2181 GLuint sampler,
2182 GLuint msg_type,
2183 GLuint response_length,
2184 GLuint msg_length,
2185 GLuint header_present,
2186 GLuint simd_mode,
2187 GLuint return_format)
2188 {
2189 struct brw_context *brw = p->brw;
2190 struct brw_instruction *insn;
2191
2192 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2193
2194 insn = next_insn(p, BRW_OPCODE_SEND);
2195 insn->header.predicate_control = 0; /* XXX */
2196 insn->header.compression_control = BRW_COMPRESSION_NONE;
2197 if (brw->gen < 6)
2198 insn->header.destreg__conditionalmod = msg_reg_nr;
2199
2200 brw_set_dest(p, insn, dest);
2201 brw_set_src0(p, insn, src0);
2202 brw_set_sampler_message(p, insn,
2203 binding_table_index,
2204 sampler,
2205 msg_type,
2206 response_length,
2207 msg_length,
2208 header_present,
2209 simd_mode,
2210 return_format);
2211 }
2212
/* Emit a URB write message.
 *
 * All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   GLuint msg_length,
                   GLuint response_length,
                   GLuint offset,
                   GLuint swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into m<msg_reg_nr>.5, starting from the value in g0.5,
       * with the execution mask disabled so it always runs.
       */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
                retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   /* The whole payload must fit in the message register file. */
   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6 SEND carries the message register number in destreg. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
2263
2264 static int
2265 next_ip(struct brw_compile *p, int ip)
2266 {
2267 struct brw_instruction *insn = (void *)p->store + ip;
2268
2269 if (insn->header.cmpt_control)
2270 return ip + 8;
2271 else
2272 return ip + 16;
2273 }
2274
2275 static int
2276 brw_find_next_block_end(struct brw_compile *p, int start)
2277 {
2278 int ip;
2279 void *store = p->store;
2280
2281 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2282 struct brw_instruction *insn = store + ip;
2283
2284 switch (insn->header.opcode) {
2285 case BRW_OPCODE_ENDIF:
2286 case BRW_OPCODE_ELSE:
2287 case BRW_OPCODE_WHILE:
2288 case BRW_OPCODE_HALT:
2289 return ip;
2290 }
2291 }
2292
2293 return 0;
2294 }
2295
/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 *
 * Returns the offset of the WHILE that closes the loop containing
 * 'start'.  Asserts (and returns 'start') if none is found.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct brw_context *brw = p->brw;
   int ip;
   /* Branch distances in the instruction are in units of 8 bytes. */
   int scale = 8;
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
         /* Gen6 stores the backward jump in jump_count; gen7+ in JIP. */
         int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
                                 : insn->bits3.break_cont.jip;
         /* A WHILE whose (negative) jump lands at or before 'start'
          * is the one that closes the loop we're inside of.
          */
         if (ip + jip * scale <= start)
            return ip;
      }
   }
   assert(!"not reached");
   return start;
}
2324
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * This pass only applies to the gen6+ JIP/UIP branch encodings; it is
 * a no-op on earlier generations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int ip;
   /* JIP/UIP fields are stored in units of 8 bytes. */
   int scale = 8;
   void *store = p->store;

   if (brw->gen < 6)
      return;

   /* Walk every instruction in the program. */
   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
         /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(insn->header.opcode != BRW_OPCODE_BREAK &&
                insn->header.opcode != BRW_OPCODE_CONTINUE &&
                insn->header.opcode != BRW_OPCODE_HALT);
         continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip +
             (brw->gen == 6 ? 16 : 0)) / scale;
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;

      case BRW_OPCODE_ENDIF:
         /* With no later block end, jump to the next instruction:
          * 2 * scale == 16 bytes, one full-size instruction.
          */
         if (block_end_ip == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         break;

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_ip == 0) {
            insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
         } else {
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         }
         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
      }
   }
}
2400
2401 void brw_ff_sync(struct brw_compile *p,
2402 struct brw_reg dest,
2403 GLuint msg_reg_nr,
2404 struct brw_reg src0,
2405 bool allocate,
2406 GLuint response_length,
2407 bool eot)
2408 {
2409 struct brw_context *brw = p->brw;
2410 struct brw_instruction *insn;
2411
2412 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2413
2414 insn = next_insn(p, BRW_OPCODE_SEND);
2415 brw_set_dest(p, insn, dest);
2416 brw_set_src0(p, insn, src0);
2417 brw_set_src1(p, insn, brw_imm_d(0));
2418
2419 if (brw->gen < 6)
2420 insn->header.destreg__conditionalmod = msg_reg_nr;
2421
2422 brw_set_ff_sync_message(p,
2423 insn,
2424 allocate,
2425 response_length,
2426 eot);
2427 }
2428
2429 /**
2430 * Emit the SEND instruction necessary to generate stream output data on Gen6
2431 * (for transform feedback).
2432 *
2433 * If send_commit_msg is true, this is the last piece of stream output data
2434 * from this thread, so send the data as a committed write. According to the
2435 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2436 *
2437 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2438 * writes are complete by sending the final write as a committed write."
2439 */
2440 void
2441 brw_svb_write(struct brw_compile *p,
2442 struct brw_reg dest,
2443 GLuint msg_reg_nr,
2444 struct brw_reg src0,
2445 GLuint binding_table_index,
2446 bool send_commit_msg)
2447 {
2448 struct brw_instruction *insn;
2449
2450 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2451
2452 insn = next_insn(p, BRW_OPCODE_SEND);
2453 brw_set_dest(p, insn, dest);
2454 brw_set_src0(p, insn, src0);
2455 brw_set_src1(p, insn, brw_imm_d(0));
2456 brw_set_dp_write_message(p, insn,
2457 binding_table_index,
2458 0, /* msg_control: ignored */
2459 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2460 1, /* msg_length */
2461 true, /* header_present */
2462 0, /* last_render_target: ignored */
2463 send_commit_msg, /* response_length */
2464 0, /* end_of_thread */
2465 send_commit_msg); /* send_commit_msg */
2466 }
2467
/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
 * one u32.  So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_compile *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   struct brw_context *brw = p->brw;
   /* The untyped atomic messages used below only exist on gen7+. */
   assert(brw->gen >= 7);

   /* Emit the SEND itself in align1 mode with the execution mask disabled. */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));

   /* Haswell uses a different SFID and message type encoding for the
    * untyped atomic than Ivybridge.
    */
   uint32_t sfid, msg_type;
   if (brw->is_haswell) {
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
   } else {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
   }

   bool header_present = false;
   bool eot = false;
   uint32_t mlen = 2; /* offset, value */
   uint32_t rlen = 0;
   brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);

   /* Hand-pack the untyped-atomic-specific descriptor fields. */
   send->bits3.ud |= msg_type << 14;
   send->bits3.ud |= 0 << 13; /* no return data */
   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
   send->bits3.ud |= BRW_AOP_ADD << 8; /* atomic operation: ADD */
   send->bits3.ud |= surf_index << 0; /* surface binding table index */
}