i965: Silence gcc warning from resizing EU store changes.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;

   /* Pre-Gen6 hardware performs the move implicitly; nothing to do. */
   if (intel->gen < 6)
      return;

   /* Already in a message register; no move needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   /* Copy the operand into the message register.  The null register is
    * skipped (there is nothing to copy), but *src is still rewritten below
    * so the SEND references the MRF.
    */
   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Emit the MOV with WE_all and no compression so it is not affected
       * by the current predication/compression state; restore state after.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Point the caller's source at the message register we populated. */
   *src = brw_message_reg(msg_reg_nr);
}
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 struct intel_context *intel = &p->brw->intel;
88 if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
89 reg->file = BRW_GENERAL_REGISTER_FILE;
90 reg->nr += 111;
91 }
92 }
93
94
/**
 * Encode @dest as the destination operand of @insn (bits1) and derive the
 * instruction's execution size from the destination register width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* GRF/MRF register numbers must fit the 7-bit encoding; ARF and MRF
    * numbers carry extra high bits and are exempt from this check.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A horizontal stride of 0 is not valid for a destination;
	  * substitute a stride of 1.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16: subreg number is in 16-byte units; writemask applies. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
148
149 extern int reg_type_size[];
150
151 static void
152 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
153 {
154 int hstride_for_reg[] = {0, 1, 2, 4};
155 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
156 int width_for_reg[] = {1, 2, 4, 8, 16};
157 int execsize_for_reg[] = {1, 2, 4, 8, 16};
158 int width, hstride, vstride, execsize;
159
160 if (reg.file == BRW_IMMEDIATE_VALUE) {
161 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
162 * mean the destination has to be 128-bit aligned and the
163 * destination horiz stride has to be a word.
164 */
165 if (reg.type == BRW_REGISTER_TYPE_V) {
166 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
167 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
168 }
169
170 return;
171 }
172
173 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
174 reg.file == BRW_ARF_NULL)
175 return;
176
177 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
178 hstride = hstride_for_reg[reg.hstride];
179
180 if (reg.vstride == 0xf) {
181 vstride = -1;
182 } else {
183 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
184 vstride = vstride_for_reg[reg.vstride];
185 }
186
187 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
188 width = width_for_reg[reg.width];
189
190 assert(insn->header.execution_size >= 0 &&
191 insn->header.execution_size < Elements(execsize_for_reg));
192 execsize = execsize_for_reg[insn->header.execution_size];
193
194 /* Restrictions from 3.3.10: Register Region Restrictions. */
195 /* 3. */
196 assert(execsize >= width);
197
198 /* 4. */
199 if (execsize == width && hstride != 0) {
200 assert(vstride == -1 || vstride == width * hstride);
201 }
202
203 /* 5. */
204 if (execsize == width && hstride == 0) {
205 /* no restriction on vstride. */
206 }
207
208 /* 6. */
209 if (width == 1) {
210 assert(hstride == 0);
211 }
212
213 /* 7. */
214 if (execsize == 1 && width == 1) {
215 assert(hstride == 0);
216 assert(vstride == 0);
217 }
218
219 /* 8. */
220 if (vstride == 0 && hstride == 0) {
221 assert(width == 1);
222 }
223
224 /* 10. Check destination issues. */
225 }
226
227 void
228 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
229 struct brw_reg reg)
230 {
231 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
232 assert(reg.nr < 128);
233
234 gen7_convert_mrf_to_grf(p, &reg);
235
236 validate_reg(insn, reg);
237
238 insn->bits1.da1.src0_reg_file = reg.file;
239 insn->bits1.da1.src0_reg_type = reg.type;
240 insn->bits2.da1.src0_abs = reg.abs;
241 insn->bits2.da1.src0_negate = reg.negate;
242 insn->bits2.da1.src0_address_mode = reg.address_mode;
243
244 if (reg.file == BRW_IMMEDIATE_VALUE) {
245 insn->bits3.ud = reg.dw1.ud;
246
247 /* Required to set some fields in src1 as well:
248 */
249 insn->bits1.da1.src1_reg_file = 0; /* arf */
250 insn->bits1.da1.src1_reg_type = reg.type;
251 }
252 else
253 {
254 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
255 if (insn->header.access_mode == BRW_ALIGN_1) {
256 insn->bits2.da1.src0_subreg_nr = reg.subnr;
257 insn->bits2.da1.src0_reg_nr = reg.nr;
258 }
259 else {
260 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
261 insn->bits2.da16.src0_reg_nr = reg.nr;
262 }
263 }
264 else {
265 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
266
267 if (insn->header.access_mode == BRW_ALIGN_1) {
268 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
269 }
270 else {
271 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
272 }
273 }
274
275 if (insn->header.access_mode == BRW_ALIGN_1) {
276 if (reg.width == BRW_WIDTH_1 &&
277 insn->header.execution_size == BRW_EXECUTE_1) {
278 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
279 insn->bits2.da1.src0_width = BRW_WIDTH_1;
280 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
281 }
282 else {
283 insn->bits2.da1.src0_horiz_stride = reg.hstride;
284 insn->bits2.da1.src0_width = reg.width;
285 insn->bits2.da1.src0_vert_stride = reg.vstride;
286 }
287 }
288 else {
289 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
290 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
291 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
292 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
293
294 /* This is an oddity of the fact we're using the same
295 * descriptions for registers in align_16 as align_1:
296 */
297 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
298 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
299 else
300 insn->bits2.da16.src0_vert_stride = reg.vstride;
301 }
302 }
303 }
304
305
/**
 * Encode @reg as source operand 1 of @insn (bits3).  MRF sources are not
 * allowed here; src1 may be an immediate, in which case it consumes the
 * whole bits3 dword.
 */
void brw_set_src1(struct brw_compile *p,
		  struct brw_instruction *insn,
		  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
	 insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
	 /* Align16: subreg number is in 16-byte units. */
	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
	 insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 /* A scalar source in a scalar instruction uses the canonical
	  * <0,1,0> region regardless of what the caller supplied.
	  */
	 if (reg.width == BRW_WIDTH_1 &&
	     insn->header.execution_size == BRW_EXECUTE_1) {
	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
	 }
	 else {
	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
	    insn->bits3.da1.src1_width = reg.width;
	    insn->bits3.da1.src1_vert_stride = reg.vstride;
	 }
      }
      else {
	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

	 /* This is an oddity of the fact we're using the same
	  * descriptions for registers in align_16 as align_1:
	  */
	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
	 else
	    insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}
375
376 /**
377 * Set the Message Descriptor and Extended Message Descriptor fields
378 * for SEND messages.
379 *
380 * \note This zeroes out the Function Control bits, so it must be called
381 * \b before filling out any message-specific data. Callers can
382 * choose not to fill in irrelevant bits; they will be zero.
383 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   /* Zero the whole descriptor dword (src1) before setting fields. */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Pre-Gen5 layout packs everything into the basic descriptor. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
418
419 static void brw_set_math_message( struct brw_compile *p,
420 struct brw_instruction *insn,
421 GLuint function,
422 GLuint integer_type,
423 bool low_precision,
424 bool saturate,
425 GLuint dataType )
426 {
427 struct brw_context *brw = p->brw;
428 struct intel_context *intel = &brw->intel;
429 unsigned msg_length;
430 unsigned response_length;
431
432 /* Infer message length from the function */
433 switch (function) {
434 case BRW_MATH_FUNCTION_POW:
435 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
436 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
437 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
438 msg_length = 2;
439 break;
440 default:
441 msg_length = 1;
442 break;
443 }
444
445 /* Infer response length from the function */
446 switch (function) {
447 case BRW_MATH_FUNCTION_SINCOS:
448 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
449 response_length = 2;
450 break;
451 default:
452 response_length = 1;
453 break;
454 }
455
456 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
457 msg_length, response_length, false, false);
458 if (intel->gen == 5) {
459 insn->bits3.math_gen5.function = function;
460 insn->bits3.math_gen5.int_type = integer_type;
461 insn->bits3.math_gen5.precision = low_precision;
462 insn->bits3.math_gen5.saturate = saturate;
463 insn->bits3.math_gen5.data_type = dataType;
464 insn->bits3.math_gen5.snapshot = 0;
465 } else {
466 insn->bits3.math.function = function;
467 insn->bits3.math.int_type = integer_type;
468 insn->bits3.math.precision = low_precision;
469 insn->bits3.math.saturate = saturate;
470 insn->bits3.math.data_type = dataType;
471 }
472 }
473
474
475 static void brw_set_ff_sync_message(struct brw_compile *p,
476 struct brw_instruction *insn,
477 bool allocate,
478 GLuint response_length,
479 bool end_of_thread)
480 {
481 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
482 1, response_length, true, end_of_thread);
483 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
484 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
485 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
486 insn->bits3.urb_gen5.allocate = allocate;
487 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
488 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
489 }
490
/* Fill in the message descriptor for a URB write, using the field layout
 * appropriate to the hardware generation.
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 bool allocate,
				 bool used,
				 GLuint msg_length,
				 GLuint response_length,
				 bool end_of_thread,
				 bool complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
   }
}
531
/**
 * Fill in the message descriptor for a data-port write, picking the
 * shared function (SFID) and field layout for the hardware generation.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* Per-generation descriptor field layouts: */
   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
590
/**
 * Fill in the message descriptor for a data-port read, picking the
 * shared function (SFID) and field layout for the hardware generation.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* On Gen6 the cache selection is expressed through the SFID. */
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      true, false);

   /* Per-generation descriptor field layouts: */
   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
647
/* Fill in the message descriptor for a sampler SEND, using the field
 * layout for the hardware generation.  return_format is only encoded on
 * original Gen4; later generations dropped the field.
 */
static void brw_set_sampler_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    GLuint binding_table_index,
                                    GLuint sampler,
                                    GLuint msg_type,
                                    GLuint response_length,
                                    GLuint msg_length,
                                    GLuint header_present,
                                    GLuint simd_mode,
                                    GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
686
687
688 #define next_insn brw_next_insn
689 struct brw_instruction *
690 brw_next_insn(struct brw_compile *p, GLuint opcode)
691 {
692 struct brw_instruction *insn;
693
694 if (p->nr_insn + 1 > p->store_size) {
695 if (0)
696 printf("incresing the store size to %d\n", p->store_size << 1);
697 p->store_size <<= 1;
698 p->store = reralloc(p->mem_ctx, p->store,
699 struct brw_instruction, p->store_size);
700 if (!p->store)
701 assert(!"realloc eu store memeory failed");
702 }
703
704 insn = &p->store[p->nr_insn++];
705 memcpy(insn, p->current, sizeof(*insn));
706
707 /* Reset this one-shot flag:
708 */
709
710 if (p->current->header.destreg__conditionalmod) {
711 p->current->header.destreg__conditionalmod = 0;
712 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
713 }
714
715 insn->header.opcode = opcode;
716 return insn;
717 }
718
719 static struct brw_instruction *brw_alu1( struct brw_compile *p,
720 GLuint opcode,
721 struct brw_reg dest,
722 struct brw_reg src )
723 {
724 struct brw_instruction *insn = next_insn(p, opcode);
725 brw_set_dest(p, insn, dest);
726 brw_set_src0(p, insn, src);
727 return insn;
728 }
729
730 static struct brw_instruction *brw_alu2(struct brw_compile *p,
731 GLuint opcode,
732 struct brw_reg dest,
733 struct brw_reg src0,
734 struct brw_reg src1 )
735 {
736 struct brw_instruction *insn = next_insn(p, opcode);
737 brw_set_dest(p, insn, dest);
738 brw_set_src0(p, insn, src0);
739 brw_set_src1(p, insn, src1);
740 return insn;
741 }
742
743
/***********************************************************************
 * Convenience routines.
 */

/* Generate a public brw_<OP> wrapper for a one-source ALU opcode. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Generate a public brw_<OP> wrapper for a two-source ALU opcode. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->intel.gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
788
789
/* Instantiate the public brw_<OP> emitters via the ALU1/ALU2 macros. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)


/* Two-instruction rounding ops (see the ROUND macro). */
ROUND(RNDZ)
ROUND(RNDE)
816
817
818 struct brw_instruction *brw_ADD(struct brw_compile *p,
819 struct brw_reg dest,
820 struct brw_reg src0,
821 struct brw_reg src1)
822 {
823 /* 6.2.2: add */
824 if (src0.type == BRW_REGISTER_TYPE_F ||
825 (src0.file == BRW_IMMEDIATE_VALUE &&
826 src0.type == BRW_REGISTER_TYPE_VF)) {
827 assert(src1.type != BRW_REGISTER_TYPE_UD);
828 assert(src1.type != BRW_REGISTER_TYPE_D);
829 }
830
831 if (src1.type == BRW_REGISTER_TYPE_F ||
832 (src1.file == BRW_IMMEDIATE_VALUE &&
833 src1.type == BRW_REGISTER_TYPE_VF)) {
834 assert(src0.type != BRW_REGISTER_TYPE_UD);
835 assert(src0.type != BRW_REGISTER_TYPE_D);
836 }
837
838 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
839 }
840
841 struct brw_instruction *brw_MUL(struct brw_compile *p,
842 struct brw_reg dest,
843 struct brw_reg src0,
844 struct brw_reg src1)
845 {
846 /* 6.32.38: mul */
847 if (src0.type == BRW_REGISTER_TYPE_D ||
848 src0.type == BRW_REGISTER_TYPE_UD ||
849 src1.type == BRW_REGISTER_TYPE_D ||
850 src1.type == BRW_REGISTER_TYPE_UD) {
851 assert(dest.type != BRW_REGISTER_TYPE_F);
852 }
853
854 if (src0.type == BRW_REGISTER_TYPE_F ||
855 (src0.file == BRW_IMMEDIATE_VALUE &&
856 src0.type == BRW_REGISTER_TYPE_VF)) {
857 assert(src1.type != BRW_REGISTER_TYPE_UD);
858 assert(src1.type != BRW_REGISTER_TYPE_D);
859 }
860
861 if (src1.type == BRW_REGISTER_TYPE_F ||
862 (src1.file == BRW_IMMEDIATE_VALUE &&
863 src1.type == BRW_REGISTER_TYPE_VF)) {
864 assert(src0.type != BRW_REGISTER_TYPE_UD);
865 assert(src0.type != BRW_REGISTER_TYPE_D);
866 }
867
868 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
869 src0.nr != BRW_ARF_ACCUMULATOR);
870 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
871 src1.nr != BRW_ARF_ACCUMULATOR);
872
873 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
874 }
875
876
877 void brw_NOP(struct brw_compile *p)
878 {
879 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
880 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
881 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
882 brw_set_src1(p, insn, brw_imm_ud(0x0));
883 }
884
885
886
887
888
889 /***********************************************************************
890 * Comparisons, if/else/endif
891 */
892
/* Emit a JMPI (jump indexed) instruction.  JMPI is always scalar
 * (execution size 1), uncompressed, and unmasked; any one-shot
 * predication in the default instruction state is cleared afterwards.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* The predicate applied to the JMPI itself; reset the default state. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
908
/* Push the offset of an IF/ELSE instruction onto the if-stack.  Offsets
 * into p->store (not pointers) are recorded because the store may be
 * reallocated.  The stack is grown *after* the write: the check below
 * guarantees capacity for the next push, so the write at the current
 * depth is always in bounds.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
921
922 static struct brw_instruction *
923 pop_if_stack(struct brw_compile *p)
924 {
925 p->if_stack_depth--;
926 return &p->store[p->if_stack[p->if_stack_depth]];
927 }
928
929 static void
930 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
931 {
932 if (p->loop_stack_array_size < p->loop_stack_depth) {
933 p->loop_stack_array_size *= 2;
934 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
935 p->loop_stack_array_size);
936 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
937 p->loop_stack_array_size);
938 }
939
940 p->loop_stack[p->loop_stack_depth] = inst - p->store;
941 p->loop_stack_depth++;
942 p->if_depth_in_loop[p->loop_stack_depth] = 0;
943 }
944
945 static struct brw_instruction *
946 get_inner_do_insn(struct brw_compile *p)
947 {
948 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
949 }
950
951 /* EU takes the value from the flag register and pushes it onto some
952 * sort of a stack (presumably merging with any flag value already on
953 * the stack). Within an if block, the flags at the top of the stack
954 * control execution on each channel of the unit, eg. on each of the
955 * 16 pixel values in our wm programs.
956 *
957 * When the matching 'else' instruction is reached (presumably by
958 * countdown of the instruction count patched in by our ELSE/ENDIF
959 * functions), the relevent flags are inverted.
960 *
961 * When the matching 'endif' instruction is reached, the flags are
962 * popped off. If the stack is now empty, normal execution resumes.
963 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      /* Pre-Gen6: IF takes IP operands and a jump count patched in later
       * by the matching ELSE/ENDIF.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      /* Gen6: jump count lives in its own field; operands are null. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7: JIP/UIP branch offsets, patched in later. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Predication was consumed by the IF itself; reset the default state. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   /* Remember this IF so the matching ELSE/ENDIF can patch it. */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1004
1005 /* This function is only used for gen6-style IF instructions with an
1006 * embedded comparison (conditional modifier). It is not used on gen7.
1007 */
1008 struct brw_instruction *
1009 gen6_IF(struct brw_compile *p, uint32_t conditional,
1010 struct brw_reg src0, struct brw_reg src1)
1011 {
1012 struct brw_instruction *insn;
1013
1014 insn = next_insn(p, BRW_OPCODE_IF);
1015
1016 brw_set_dest(p, insn, brw_imm_w(0));
1017 if (p->compressed) {
1018 insn->header.execution_size = BRW_EXECUTE_16;
1019 } else {
1020 insn->header.execution_size = BRW_EXECUTE_8;
1021 }
1022 insn->bits1.branch_gen6.jump_count = 0;
1023 brw_set_src0(p, insn, src0);
1024 brw_set_src1(p, insn, src1);
1025
1026 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1027 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1028 insn->header.destreg__conditionalmod = conditional;
1029
1030 if (!p->single_program_flow)
1031 insn->header.thread_control = BRW_THREAD_SWITCH;
1032
1033 push_if_stack(p, insn);
1034 return insn;
1035 }
1036
1037 /**
1038 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1039 */
1040 static void
1041 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1042 struct brw_instruction *if_inst,
1043 struct brw_instruction *else_inst)
1044 {
1045 /* The next instruction (where the ENDIF would be, if it existed) */
1046 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1047
1048 assert(p->single_program_flow);
1049 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1050 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1051 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1052
1053 /* Convert IF to an ADD instruction that moves the instruction pointer
1054 * to the first instruction of the ELSE block. If there is no ELSE
1055 * block, point to where ENDIF would be. Reverse the predicate.
1056 *
1057 * There's no need to execute an ENDIF since we don't need to do any
1058 * stack operations, and if we're currently executing, we just want to
1059 * continue normally.
1060 */
1061 if_inst->header.opcode = BRW_OPCODE_ADD;
1062 if_inst->header.predicate_inverse = 1;
1063
1064 if (else_inst != NULL) {
1065 /* Convert ELSE to an ADD instruction that points where the ENDIF
1066 * would be.
1067 */
1068 else_inst->header.opcode = BRW_OPCODE_ADD;
1069
1070 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1071 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1072 } else {
1073 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1074 }
1075 }
1076
1077 /**
1078 * Patch IF and ELSE instructions with appropriate jump targets.
1079 */
1080 static void
1081 patch_IF_ELSE(struct brw_compile *p,
1082 struct brw_instruction *if_inst,
1083 struct brw_instruction *else_inst,
1084 struct brw_instruction *endif_inst)
1085 {
1086 struct intel_context *intel = &p->brw->intel;
1087
1088 /* We shouldn't be patching IF and ELSE instructions in single program flow
1089 * mode when gen < 6, because in single program flow mode on those
1090 * platforms, we convert flow control instructions to conditional ADDs that
1091 * operate on IP (see brw_ENDIF).
1092 *
1093 * However, on Gen6, writing to IP doesn't work in single program flow mode
1094 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1095 * not be updated by non-flow control instructions."). And on later
1096 * platforms, there is no significant benefit to converting control flow
1097 * instructions to conditional ADDs. So we do patch IF and ELSE
1098 * instructions in single program flow mode on those platforms.
1099 */
1100 if (intel->gen < 6)
1101 assert(!p->single_program_flow);
1102
1103 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1104 assert(endif_inst != NULL);
1105 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1106
1107 unsigned br = 1;
1108 /* Jump count is for 64bit data chunk each, so one 128bit instruction
1109 * requires 2 chunks.
1110 */
1111 if (intel->gen >= 5)
1112 br = 2;
1113
1114 assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1115 endif_inst->header.execution_size = if_inst->header.execution_size;
1116
1117 if (else_inst == NULL) {
1118 /* Patch IF -> ENDIF */
1119 if (intel->gen < 6) {
1120 /* Turn it into an IFF, which means no mask stack operations for
1121 * all-false and jumping past the ENDIF.
1122 */
1123 if_inst->header.opcode = BRW_OPCODE_IFF;
1124 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1125 if_inst->bits3.if_else.pop_count = 0;
1126 if_inst->bits3.if_else.pad0 = 0;
1127 } else if (intel->gen == 6) {
1128 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1129 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1130 } else {
1131 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1132 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1133 }
1134 } else {
1135 else_inst->header.execution_size = if_inst->header.execution_size;
1136
1137 /* Patch IF -> ELSE */
1138 if (intel->gen < 6) {
1139 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1140 if_inst->bits3.if_else.pop_count = 0;
1141 if_inst->bits3.if_else.pad0 = 0;
1142 } else if (intel->gen == 6) {
1143 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1144 }
1145
1146 /* Patch ELSE -> ENDIF */
1147 if (intel->gen < 6) {
1148 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1149 * matching ENDIF.
1150 */
1151 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1152 else_inst->bits3.if_else.pop_count = 1;
1153 else_inst->bits3.if_else.pad0 = 0;
1154 } else if (intel->gen == 6) {
1155 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1156 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1157 } else {
1158 /* The IF instruction's JIP should point just past the ELSE */
1159 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1160 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1161 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1162 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1163 }
1164 }
1165 }
1166
1167 void
1168 brw_ELSE(struct brw_compile *p)
1169 {
1170 struct intel_context *intel = &p->brw->intel;
1171 struct brw_instruction *insn;
1172
1173 insn = next_insn(p, BRW_OPCODE_ELSE);
1174
1175 if (intel->gen < 6) {
1176 brw_set_dest(p, insn, brw_ip_reg());
1177 brw_set_src0(p, insn, brw_ip_reg());
1178 brw_set_src1(p, insn, brw_imm_d(0x0));
1179 } else if (intel->gen == 6) {
1180 brw_set_dest(p, insn, brw_imm_w(0));
1181 insn->bits1.branch_gen6.jump_count = 0;
1182 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1183 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1184 } else {
1185 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1186 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1187 brw_set_src1(p, insn, brw_imm_ud(0));
1188 insn->bits3.break_cont.jip = 0;
1189 insn->bits3.break_cont.uip = 0;
1190 }
1191
1192 insn->header.compression_control = BRW_COMPRESSION_NONE;
1193 insn->header.mask_control = BRW_MASK_ENABLE;
1194 if (!p->single_program_flow)
1195 insn->header.thread_control = BRW_THREAD_SWITCH;
1196
1197 push_if_stack(p, insn);
1198 }
1199
/* Close the innermost open IF (and optional ELSE), then patch the whole
 * construct's jump targets via patch_IF_ELSE().  In SPF mode on gen4/5,
 * no ENDIF is emitted at all and the IF/ELSE are rewritten as ADDs.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encodings for the ENDIF itself. */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1278
1279 struct brw_instruction *brw_BREAK(struct brw_compile *p)
1280 {
1281 struct intel_context *intel = &p->brw->intel;
1282 struct brw_instruction *insn;
1283
1284 insn = next_insn(p, BRW_OPCODE_BREAK);
1285 if (intel->gen >= 6) {
1286 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1287 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1288 brw_set_src1(p, insn, brw_imm_d(0x0));
1289 } else {
1290 brw_set_dest(p, insn, brw_ip_reg());
1291 brw_set_src0(p, insn, brw_ip_reg());
1292 brw_set_src1(p, insn, brw_imm_d(0x0));
1293 insn->bits3.if_else.pad0 = 0;
1294 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1295 }
1296 insn->header.compression_control = BRW_COMPRESSION_NONE;
1297 insn->header.execution_size = BRW_EXECUTE_8;
1298
1299 return insn;
1300 }
1301
1302 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1303 {
1304 struct brw_instruction *insn;
1305
1306 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1307 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1308 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1309 brw_set_dest(p, insn, brw_ip_reg());
1310 brw_set_src0(p, insn, brw_ip_reg());
1311 brw_set_src1(p, insn, brw_imm_d(0x0));
1312
1313 insn->header.compression_control = BRW_COMPRESSION_NONE;
1314 insn->header.execution_size = BRW_EXECUTE_8;
1315 return insn;
1316 }
1317
1318 struct brw_instruction *brw_CONT(struct brw_compile *p)
1319 {
1320 struct brw_instruction *insn;
1321 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1322 brw_set_dest(p, insn, brw_ip_reg());
1323 brw_set_src0(p, insn, brw_ip_reg());
1324 brw_set_src1(p, insn, brw_imm_d(0x0));
1325 insn->header.compression_control = BRW_COMPRESSION_NONE;
1326 insn->header.execution_size = BRW_EXECUTE_8;
1327 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1328 insn->bits3.if_else.pad0 = 0;
1329 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1330 return insn;
1331 }
1332
1333 /* DO/WHILE loop:
1334 *
1335 * The DO/WHILE is just an unterminated loop -- break or continue are
1336 * used for control within the loop. We have a few ways they can be
1337 * done.
1338 *
1339 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1340 * jip and no DO instruction.
1341 *
1342 * For non-uniform control flow pre-gen6, there's a DO instruction to
1343 * push the mask, and a WHILE to jump back, and BREAK to get out and
1344 * pop the mask.
1345 *
1346 * For gen6, there's no more mask stack, so no need for DO. WHILE
1347 * just points back to the first instruction of the loop.
1348 */
1349 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1350 {
1351 struct intel_context *intel = &p->brw->intel;
1352
1353 if (intel->gen >= 6 || p->single_program_flow) {
1354 push_loop_stack(p, &p->store[p->nr_insn]);
1355 return &p->store[p->nr_insn];
1356 } else {
1357 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1358
1359 push_loop_stack(p, insn);
1360
1361 /* Override the defaults for this instruction:
1362 */
1363 brw_set_dest(p, insn, brw_null_reg());
1364 brw_set_src0(p, insn, brw_null_reg());
1365 brw_set_src1(p, insn, brw_null_reg());
1366
1367 insn->header.compression_control = BRW_COMPRESSION_NONE;
1368 insn->header.execution_size = execute_size;
1369 insn->header.predicate_control = BRW_PREDICATE_NONE;
1370 /* insn->header.mask_control = BRW_MASK_ENABLE; */
1371 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1372
1373 return insn;
1374 }
1375 }
1376
1377 /**
1378 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1379 * instruction here.
1380 *
1381 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1382 * nesting, since it can always just point to the end of the block/current loop.
1383 */
1384 static void
1385 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1386 {
1387 struct intel_context *intel = &p->brw->intel;
1388 struct brw_instruction *do_inst = get_inner_do_insn(p);
1389 struct brw_instruction *inst;
1390 int br = (intel->gen == 5) ? 2 : 1;
1391
1392 for (inst = while_inst - 1; inst != do_inst; inst--) {
1393 /* If the jump count is != 0, that means that this instruction has already
1394 * been patched because it's part of a loop inside of the one we're
1395 * patching.
1396 */
1397 if (inst->header.opcode == BRW_OPCODE_BREAK &&
1398 inst->bits3.if_else.jump_count == 0) {
1399 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1400 } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1401 inst->bits3.if_else.jump_count == 0) {
1402 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1403 }
1404 }
1405 }
1406
/* Close the innermost loop with a WHILE (or, in pre-gen6 SPF mode, an
 * ADD on IP), point it back at the loop top, and on pre-gen6 patch any
 * pending BREAK/CONT instructions inside the loop body.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   /* Gen5+ counts jumps in 64-bit chunks, two per 128-bit instruction. */
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   /* In each path below, next_insn() is called before get_inner_do_insn()
    * because next_insn() may reallocate p->store, invalidating any
    * previously-fetched instruction pointer.
    */
   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* Negative JIP: jump backwards to the loop top. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
         /* SPF mode: a plain backwards ADD on IP replaces the WHILE. */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         insn->header.execution_size = BRW_EXECUTE_1;
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(do_insn->header.opcode == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         insn->header.execution_size = do_insn->header.execution_size;
         insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
         insn->bits3.if_else.pop_count = 0;
         insn->bits3.if_else.pad0 = 0;

         /* Point pending BREAK/CONT instructions at this WHILE. */
         brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1470
1471
1472 /* FORWARD JUMPS:
1473 */
1474 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1475 {
1476 struct intel_context *intel = &p->brw->intel;
1477 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1478 GLuint jmpi = 1;
1479
1480 if (intel->gen >= 5)
1481 jmpi = 2;
1482
1483 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1484 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1485
1486 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1487 }
1488
1489
1490
1491 /* To integrate with the above, it makes sense that the comparison
1492 * instruction should populate the flag register. It might be simpler
1493 * just to use the flag reg for most WM tasks?
1494 */
1495 void brw_CMP(struct brw_compile *p,
1496 struct brw_reg dest,
1497 GLuint conditional,
1498 struct brw_reg src0,
1499 struct brw_reg src1)
1500 {
1501 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1502
1503 insn->header.destreg__conditionalmod = conditional;
1504 brw_set_dest(p, insn, dest);
1505 brw_set_src0(p, insn, src0);
1506 brw_set_src1(p, insn, src1);
1507
1508 /* guess_execution_size(insn, src0); */
1509
1510
1511 /* Make it so that future instructions will use the computed flag
1512 * value until brw_set_predicate_control_flag_value() is called
1513 * again.
1514 */
1515 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1516 dest.nr == 0) {
1517 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1518 p->flag_value = 0xff;
1519 }
1520 }
1521
1522 /* Issue 'wait' instruction for n1, host could program MMIO
1523 to wake up thread. */
1524 void brw_WAIT (struct brw_compile *p)
1525 {
1526 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1527 struct brw_reg src = brw_notification_1_reg();
1528
1529 brw_set_dest(p, insn, src);
1530 brw_set_src0(p, insn, src);
1531 brw_set_src1(p, insn, brw_null_reg());
1532 insn->header.execution_size = 0; /* must */
1533 insn->header.predicate_control = 0;
1534 insn->header.compression_control = 0;
1535 }
1536
1537
1538 /***********************************************************************
1539 * Helpers for the various SEND message types:
1540 */
1541
/** Extended math function, float[8].
 *
 * On gen6+ this emits a native MATH instruction; on older gens it emits
 * a SEND to the shared math unit (EU extended-math message).
 *
 * \param function    one of the BRW_MATH_FUNCTION_* opcodes
 * \param msg_reg_nr  message register (pre-gen6 SEND path only)
 * \param data_type   message data format (pre-gen6 SEND path only)
 * \param precision   math precision (pre-gen6 SEND path only)
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint saturate,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* INT DIV variants take integer sources; everything else is float. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   saturate,
			   data_type);
   }
}
1608
/** Extended math function, float[8].
 *
 * Two-source variant (e.g. POW, INT DIV); gen6+ only, since only the
 * native MATH instruction accepts a second source.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);
   /* Keep intel referenced when asserts compile out (NDEBUG), to avoid
    * an unused-variable warning.
    */
   (void) intel;


   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (intel->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* INT DIV variants take integer sources; everything else is float. */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (intel->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1661
1662 /**
1663 * Extended math function, float[16].
1664 * Use 2 send instructions.
1665 */
1666 void brw_math_16( struct brw_compile *p,
1667 struct brw_reg dest,
1668 GLuint function,
1669 GLuint saturate,
1670 GLuint msg_reg_nr,
1671 struct brw_reg src,
1672 GLuint precision )
1673 {
1674 struct intel_context *intel = &p->brw->intel;
1675 struct brw_instruction *insn;
1676
1677 if (intel->gen >= 6) {
1678 insn = next_insn(p, BRW_OPCODE_MATH);
1679
1680 /* Math is the same ISA format as other opcodes, except that CondModifier
1681 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1682 */
1683 insn->header.destreg__conditionalmod = function;
1684 insn->header.saturate = saturate;
1685
1686 /* Source modifiers are ignored for extended math instructions. */
1687 assert(!src.negate);
1688 assert(!src.abs);
1689
1690 brw_set_dest(p, insn, dest);
1691 brw_set_src0(p, insn, src);
1692 brw_set_src1(p, insn, brw_null_reg());
1693 return;
1694 }
1695
1696 /* First instruction:
1697 */
1698 brw_push_insn_state(p);
1699 brw_set_predicate_control_flag_value(p, 0xff);
1700 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1701
1702 insn = next_insn(p, BRW_OPCODE_SEND);
1703 insn->header.destreg__conditionalmod = msg_reg_nr;
1704
1705 brw_set_dest(p, insn, dest);
1706 brw_set_src0(p, insn, src);
1707 brw_set_math_message(p,
1708 insn,
1709 function,
1710 BRW_MATH_INTEGER_UNSIGNED,
1711 precision,
1712 saturate,
1713 BRW_MATH_DATA_VECTOR);
1714
1715 /* Second instruction:
1716 */
1717 insn = next_insn(p, BRW_OPCODE_SEND);
1718 insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1719 insn->header.destreg__conditionalmod = msg_reg_nr+1;
1720
1721 brw_set_dest(p, insn, offset(dest,1));
1722 brw_set_src0(p, insn, src);
1723 brw_set_math_message(p,
1724 insn,
1725 function,
1726 BRW_MATH_INTEGER_UNSIGNED,
1727 precision,
1728 saturate,
1729 BRW_MATH_DATA_VECTOR);
1730
1731 brw_pop_insn_state(p);
1732 }
1733
1734
1735 /**
1736 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1737 * using a constant offset per channel.
1738 *
1739 * The offset must be aligned to oword size (16 bytes). Used for
1740 * register spilling.
1741 */
1742 void brw_oword_block_write_scratch(struct brw_compile *p,
1743 struct brw_reg mrf,
1744 int num_regs,
1745 GLuint offset)
1746 {
1747 struct intel_context *intel = &p->brw->intel;
1748 uint32_t msg_control, msg_type;
1749 int mlen;
1750
1751 if (intel->gen >= 6)
1752 offset /= 16;
1753
1754 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1755
1756 if (num_regs == 1) {
1757 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1758 mlen = 2;
1759 } else {
1760 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1761 mlen = 3;
1762 }
1763
1764 /* Set up the message header. This is g0, with g0.2 filled with
1765 * the offset. We don't want to leave our offset around in g0 or
1766 * it'll screw up texture samples, so set it up inside the message
1767 * reg.
1768 */
1769 {
1770 brw_push_insn_state(p);
1771 brw_set_mask_control(p, BRW_MASK_DISABLE);
1772 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1773
1774 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1775
1776 /* set message header global offset field (reg 0, element 2) */
1777 brw_MOV(p,
1778 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1779 mrf.nr,
1780 2), BRW_REGISTER_TYPE_UD),
1781 brw_imm_ud(offset));
1782
1783 brw_pop_insn_state(p);
1784 }
1785
1786 {
1787 struct brw_reg dest;
1788 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1789 int send_commit_msg;
1790 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1791 BRW_REGISTER_TYPE_UW);
1792
1793 if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1794 insn->header.compression_control = BRW_COMPRESSION_NONE;
1795 src_header = vec16(src_header);
1796 }
1797 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1798 insn->header.destreg__conditionalmod = mrf.nr;
1799
1800 /* Until gen6, writes followed by reads from the same location
1801 * are not guaranteed to be ordered unless write_commit is set.
1802 * If set, then a no-op write is issued to the destination
1803 * register to set a dependency, and a read from the destination
1804 * can be used to ensure the ordering.
1805 *
1806 * For gen6, only writes between different threads need ordering
1807 * protection. Our use of DP writes is all about register
1808 * spilling within a thread.
1809 */
1810 if (intel->gen >= 6) {
1811 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1812 send_commit_msg = 0;
1813 } else {
1814 dest = src_header;
1815 send_commit_msg = 1;
1816 }
1817
1818 brw_set_dest(p, insn, dest);
1819 if (intel->gen >= 6) {
1820 brw_set_src0(p, insn, mrf);
1821 } else {
1822 brw_set_src0(p, insn, brw_null_reg());
1823 }
1824
1825 if (intel->gen >= 6)
1826 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1827 else
1828 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1829
1830 brw_set_dp_write_message(p,
1831 insn,
1832 255, /* binding table index (255=stateless) */
1833 msg_control,
1834 msg_type,
1835 mlen,
1836 true, /* header_present */
1837 0, /* not a render target */
1838 send_commit_msg, /* response_length */
1839 0, /* eot */
1840 send_commit_msg);
1841 }
1842 }
1843
1844
1845 /**
1846 * Read a block of owords (half a GRF each) from the scratch buffer
1847 * using a constant index per channel.
1848 *
1849 * Offset must be aligned to oword size (16 bytes). Used for register
1850 * spilling.
1851 */
1852 void
1853 brw_oword_block_read_scratch(struct brw_compile *p,
1854 struct brw_reg dest,
1855 struct brw_reg mrf,
1856 int num_regs,
1857 GLuint offset)
1858 {
1859 struct intel_context *intel = &p->brw->intel;
1860 uint32_t msg_control;
1861 int rlen;
1862
1863 if (intel->gen >= 6)
1864 offset /= 16;
1865
1866 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1867 dest = retype(dest, BRW_REGISTER_TYPE_UW);
1868
1869 if (num_regs == 1) {
1870 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1871 rlen = 1;
1872 } else {
1873 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1874 rlen = 2;
1875 }
1876
1877 {
1878 brw_push_insn_state(p);
1879 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1880 brw_set_mask_control(p, BRW_MASK_DISABLE);
1881
1882 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1883
1884 /* set message header global offset field (reg 0, element 2) */
1885 brw_MOV(p,
1886 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1887 mrf.nr,
1888 2), BRW_REGISTER_TYPE_UD),
1889 brw_imm_ud(offset));
1890
1891 brw_pop_insn_state(p);
1892 }
1893
1894 {
1895 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1896
1897 assert(insn->header.predicate_control == 0);
1898 insn->header.compression_control = BRW_COMPRESSION_NONE;
1899 insn->header.destreg__conditionalmod = mrf.nr;
1900
1901 brw_set_dest(p, insn, dest); /* UW? */
1902 if (intel->gen >= 6) {
1903 brw_set_src0(p, insn, mrf);
1904 } else {
1905 brw_set_src0(p, insn, brw_null_reg());
1906 }
1907
1908 brw_set_dp_read_message(p,
1909 insn,
1910 255, /* binding table index (255=stateless) */
1911 msg_control,
1912 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1913 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1914 1, /* msg_length */
1915 rlen);
1916 }
1917 }
1918
1919 /**
1920 * Read a float[4] vector from the data port Data Cache (const buffer).
1921 * Location (in buffer) should be a multiple of 16.
1922 * Used for fetching shader constants.
1923 */
1924 void brw_oword_block_read(struct brw_compile *p,
1925 struct brw_reg dest,
1926 struct brw_reg mrf,
1927 uint32_t offset,
1928 uint32_t bind_table_index)
1929 {
1930 struct intel_context *intel = &p->brw->intel;
1931
1932 /* On newer hardware, offset is in units of owords. */
1933 if (intel->gen >= 6)
1934 offset /= 16;
1935
1936 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1937
1938 brw_push_insn_state(p);
1939 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1940 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1941 brw_set_mask_control(p, BRW_MASK_DISABLE);
1942
1943 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1944
1945 /* set message header global offset field (reg 0, element 2) */
1946 brw_MOV(p,
1947 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1948 mrf.nr,
1949 2), BRW_REGISTER_TYPE_UD),
1950 brw_imm_ud(offset));
1951
1952 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1953 insn->header.destreg__conditionalmod = mrf.nr;
1954
1955 /* cast dest to a uword[8] vector */
1956 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1957
1958 brw_set_dest(p, insn, dest);
1959 if (intel->gen >= 6) {
1960 brw_set_src0(p, insn, mrf);
1961 } else {
1962 brw_set_src0(p, insn, brw_null_reg());
1963 }
1964
1965 brw_set_dp_read_message(p,
1966 insn,
1967 bind_table_index,
1968 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1969 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1970 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1971 1, /* msg_length */
1972 1); /* response_length (1 reg, 2 owords!) */
1973
1974 brw_pop_insn_state(p);
1975 }
1976
1977 /**
1978 * Read a set of dwords from the data port Data Cache (const buffer).
1979 *
1980 * Location (in buffer) appears as UD offsets in the register after
1981 * the provided mrf header reg.
1982 */
1983 void brw_dword_scattered_read(struct brw_compile *p,
1984 struct brw_reg dest,
1985 struct brw_reg mrf,
1986 uint32_t bind_table_index)
1987 {
1988 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1989
1990 brw_push_insn_state(p);
1991 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1992 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1993 brw_set_mask_control(p, BRW_MASK_DISABLE);
1994 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1995 brw_pop_insn_state(p);
1996
1997 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1998 insn->header.destreg__conditionalmod = mrf.nr;
1999
2000 /* cast dest to a uword[8] vector */
2001 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2002
2003 brw_set_dest(p, insn, dest);
2004 brw_set_src0(p, insn, brw_null_reg());
2005
2006 brw_set_dp_read_message(p,
2007 insn,
2008 bind_table_index,
2009 BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
2010 BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
2011 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2012 2, /* msg_length */
2013 1); /* response_length */
2014 }
2015
2016
2017
2018 /**
2019 * Read float[4] constant(s) from VS constant buffer.
2020 * For relative addressing, two float[4] constants will be read into 'dest'.
2021 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
2022 */
2023 void brw_dp_READ_4_vs(struct brw_compile *p,
2024 struct brw_reg dest,
2025 GLuint location,
2026 GLuint bind_table_index)
2027 {
2028 struct intel_context *intel = &p->brw->intel;
2029 struct brw_instruction *insn;
2030 GLuint msg_reg_nr = 1;
2031
2032 if (intel->gen >= 6)
2033 location /= 16;
2034
2035 /* Setup MRF[1] with location/offset into const buffer */
2036 brw_push_insn_state(p);
2037 brw_set_access_mode(p, BRW_ALIGN_1);
2038 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2039 brw_set_mask_control(p, BRW_MASK_DISABLE);
2040 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2041 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
2042 BRW_REGISTER_TYPE_UD),
2043 brw_imm_ud(location));
2044 brw_pop_insn_state(p);
2045
2046 insn = next_insn(p, BRW_OPCODE_SEND);
2047
2048 insn->header.predicate_control = BRW_PREDICATE_NONE;
2049 insn->header.compression_control = BRW_COMPRESSION_NONE;
2050 insn->header.destreg__conditionalmod = msg_reg_nr;
2051 insn->header.mask_control = BRW_MASK_DISABLE;
2052
2053 brw_set_dest(p, insn, dest);
2054 if (intel->gen >= 6) {
2055 brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
2056 } else {
2057 brw_set_src0(p, insn, brw_null_reg());
2058 }
2059
2060 brw_set_dp_read_message(p,
2061 insn,
2062 bind_table_index,
2063 0,
2064 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2065 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2066 1, /* msg_length */
2067 1); /* response_length (1 Oword) */
2068 }
2069
2070 /**
2071 * Read a float[4] constant per vertex from VS constant buffer, with
2072 * relative addressing.
2073 */
2074 void brw_dp_READ_4_vs_relative(struct brw_compile *p,
2075 struct brw_reg dest,
2076 struct brw_reg addr_reg,
2077 GLuint offset,
2078 GLuint bind_table_index)
2079 {
2080 struct intel_context *intel = &p->brw->intel;
2081 struct brw_reg src = brw_vec8_grf(0, 0);
2082 int msg_type;
2083
2084 /* Setup MRF[1] with offset into const buffer */
2085 brw_push_insn_state(p);
2086 brw_set_access_mode(p, BRW_ALIGN_1);
2087 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2088 brw_set_mask_control(p, BRW_MASK_DISABLE);
2089 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2090
2091 /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2092 * fields ignored.
2093 */
2094 brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2095 addr_reg, brw_imm_d(offset));
2096 brw_pop_insn_state(p);
2097
2098 gen6_resolve_implied_move(p, &src, 0);
2099 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2100
2101 insn->header.predicate_control = BRW_PREDICATE_NONE;
2102 insn->header.compression_control = BRW_COMPRESSION_NONE;
2103 insn->header.destreg__conditionalmod = 0;
2104 insn->header.mask_control = BRW_MASK_DISABLE;
2105
2106 brw_set_dest(p, insn, dest);
2107 brw_set_src0(p, insn, src);
2108
2109 if (intel->gen >= 6)
2110 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2111 else if (intel->gen == 5 || intel->is_g4x)
2112 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2113 else
2114 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2115
2116 brw_set_dp_read_message(p,
2117 insn,
2118 bind_table_index,
2119 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2120 msg_type,
2121 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2122 2, /* msg_length */
2123 1); /* response_length */
2124 }
2125
2126
2127
/**
 * Emit a render target write (framebuffer write) message.
 *
 * \param dispatch_width       8 or 16; selects message control and the null
 *                             destination width
 * \param msg_reg_nr           first MRF of the color payload
 * \param src0                 send source (replaced by the MRF on gen6+)
 * \param binding_table_index  render target binding table entry
 * \param msg_length           payload length in registers
 * \param response_length      usually 0 for RT writes
 * \param eot                  terminate the thread after this write
 * \param header_present       whether the payload starts with a header
 */
void brw_fb_WRITE(struct brw_compile *p,
		  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  bool eot,
                  bool header_present)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_control, msg_type;
   struct brw_reg dest;

   /* RT writes have no real destination; use a null reg of matching width. */
   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   /* NOTE(review): SENDC is used for binding table entry 0 on gen6+ —
    * presumably for render-target dependency checking; confirm against the
    * PRM before relying on this.
    */
   if (intel->gen >= 6 && binding_table_index == 0) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (intel->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      /* Pre-gen6: payload MRF is implied via destreg__conditionalmod. */
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   if (dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    msg_length,
			    header_present,
			    1, /* last render target write */
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2187
2188
2189 /**
2190 * Texture sample instruction.
2191 * Note: the msg_type plus msg_length values determine exactly what kind
2192 * of sampling operation is performed. See volume 4, page 161 of docs.
2193 */
2194 void brw_SAMPLE(struct brw_compile *p,
2195 struct brw_reg dest,
2196 GLuint msg_reg_nr,
2197 struct brw_reg src0,
2198 GLuint binding_table_index,
2199 GLuint sampler,
2200 GLuint writemask,
2201 GLuint msg_type,
2202 GLuint response_length,
2203 GLuint msg_length,
2204 GLuint header_present,
2205 GLuint simd_mode,
2206 GLuint return_format)
2207 {
2208 struct intel_context *intel = &p->brw->intel;
2209 bool need_stall = 0;
2210
2211 if (writemask == 0) {
2212 /*printf("%s: zero writemask??\n", __FUNCTION__); */
2213 return;
2214 }
2215
2216 /* Hardware doesn't do destination dependency checking on send
2217 * instructions properly. Add a workaround which generates the
2218 * dependency by other means. In practice it seems like this bug
2219 * only crops up for texture samples, and only where registers are
2220 * written by the send and then written again later without being
2221 * read in between. Luckily for us, we already track that
2222 * information and use it to modify the writemask for the
2223 * instruction, so that is a guide for whether a workaround is
2224 * needed.
2225 */
2226 if (writemask != WRITEMASK_XYZW) {
2227 GLuint dst_offset = 0;
2228 GLuint i, newmask = 0, len = 0;
2229
2230 for (i = 0; i < 4; i++) {
2231 if (writemask & (1<<i))
2232 break;
2233 dst_offset += 2;
2234 }
2235 for (; i < 4; i++) {
2236 if (!(writemask & (1<<i)))
2237 break;
2238 newmask |= 1<<i;
2239 len++;
2240 }
2241
2242 if (newmask != writemask) {
2243 need_stall = 1;
2244 /* printf("need stall %x %x\n", newmask , writemask); */
2245 }
2246 else {
2247 bool dispatch_16 = false;
2248
2249 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2250
2251 guess_execution_size(p, p->current, dest);
2252 if (p->current->header.execution_size == BRW_EXECUTE_16)
2253 dispatch_16 = true;
2254
2255 newmask = ~newmask & WRITEMASK_XYZW;
2256
2257 brw_push_insn_state(p);
2258
2259 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2260 brw_set_mask_control(p, BRW_MASK_DISABLE);
2261
2262 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2263 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2264 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2265
2266 brw_pop_insn_state(p);
2267
2268 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2269 dest = offset(dest, dst_offset);
2270
2271 /* For 16-wide dispatch, masked channels are skipped in the
2272 * response. For 8-wide, masked channels still take up slots,
2273 * and are just not written to.
2274 */
2275 if (dispatch_16)
2276 response_length = len * 2;
2277 }
2278 }
2279
2280 {
2281 struct brw_instruction *insn;
2282
2283 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2284
2285 insn = next_insn(p, BRW_OPCODE_SEND);
2286 insn->header.predicate_control = 0; /* XXX */
2287 insn->header.compression_control = BRW_COMPRESSION_NONE;
2288 if (intel->gen < 6)
2289 insn->header.destreg__conditionalmod = msg_reg_nr;
2290
2291 brw_set_dest(p, insn, dest);
2292 brw_set_src0(p, insn, src0);
2293 brw_set_sampler_message(p, insn,
2294 binding_table_index,
2295 sampler,
2296 msg_type,
2297 response_length,
2298 msg_length,
2299 header_present,
2300 simd_mode,
2301 return_format);
2302 }
2303
2304 if (need_stall) {
2305 struct brw_reg reg = vec8(offset(dest, response_length-1));
2306
2307 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2308 */
2309 brw_push_insn_state(p);
2310 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2311 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2312 retype(reg, BRW_REGISTER_TYPE_UD));
2313 brw_pop_insn_state(p);
2314 }
2315
2316 }
2317
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
/**
 * Emit a URB write message.
 *
 * \param msg_reg_nr first MRF of the payload
 * \param src0       send source (replaced by the MRF on gen6+ via
 *                   gen6_resolve_implied_move)
 * \param allocate / used / writes_complete / offset / swizzle
 *                   forwarded verbatim to brw_set_urb_message
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   bool used,
		   GLuint msg_length,
		   GLuint response_length,
		   bool eot,
		   bool writes_complete,
		   GLuint offset,
		   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      /* OR 0xff00 into header dword 5 (g0.5). */
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   /* The payload must fit in the message register file. */
   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      /* Pre-gen6: payload MRF is implied via destreg__conditionalmod. */
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       allocate,
		       used,
		       msg_length,
		       response_length,
		       eot,
		       writes_complete,
		       offset,
		       swizzle);
}
2373
2374 static int
2375 brw_find_next_block_end(struct brw_compile *p, int start)
2376 {
2377 int ip;
2378
2379 for (ip = start + 1; ip < p->nr_insn; ip++) {
2380 struct brw_instruction *insn = &p->store[ip];
2381
2382 switch (insn->header.opcode) {
2383 case BRW_OPCODE_ENDIF:
2384 case BRW_OPCODE_ELSE:
2385 case BRW_OPCODE_WHILE:
2386 return ip;
2387 }
2388 }
2389 assert(!"not reached");
2390 return start + 1;
2391 }
2392
2393 /* There is no DO instruction on gen6, so to find the end of the loop
2394 * we have to see if the loop is jumping back before our start
2395 * instruction.
2396 */
2397 static int
2398 brw_find_loop_end(struct brw_compile *p, int start)
2399 {
2400 struct intel_context *intel = &p->brw->intel;
2401 int ip;
2402 int br = 2;
2403
2404 for (ip = start + 1; ip < p->nr_insn; ip++) {
2405 struct brw_instruction *insn = &p->store[ip];
2406
2407 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2408 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2409 : insn->bits3.break_cont.jip;
2410 if (ip + jip / br <= start)
2411 return ip;
2412 }
2413 }
2414 assert(!"not reached");
2415 return start + 1;
2416 }
2417
/* After program generation, go back and update the UIP and JIP of
 * BREAK and CONT instructions to their correct locations.
 *
 * Only needed on gen6+; earlier gens don't use UIP/JIP (hence the early
 * return).  Branch distances are stored scaled by br (2 units per
 * instruction).
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int br = 2;

   if (intel->gen < 6)
      return;

   for (ip = 0; ip < p->nr_insn; ip++) {
      struct brw_instruction *insn = &p->store[ip];

      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
	 /* JIP: distance to the end of the enclosing block. */
	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
	 insn->bits3.break_cont.uip =
	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
	 break;
      case BRW_OPCODE_CONTINUE:
	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);

	 /* Both targets are forward, so the encoded distances must be
	  * non-zero.
	  */
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2451
2452 void brw_ff_sync(struct brw_compile *p,
2453 struct brw_reg dest,
2454 GLuint msg_reg_nr,
2455 struct brw_reg src0,
2456 bool allocate,
2457 GLuint response_length,
2458 bool eot)
2459 {
2460 struct intel_context *intel = &p->brw->intel;
2461 struct brw_instruction *insn;
2462
2463 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2464
2465 insn = next_insn(p, BRW_OPCODE_SEND);
2466 brw_set_dest(p, insn, dest);
2467 brw_set_src0(p, insn, src0);
2468 brw_set_src1(p, insn, brw_imm_d(0));
2469
2470 if (intel->gen < 6)
2471 insn->header.destreg__conditionalmod = msg_reg_nr;
2472
2473 brw_set_ff_sync_message(p,
2474 insn,
2475 allocate,
2476 response_length,
2477 eot);
2478 }
2479
2480 /**
2481 * Emit the SEND instruction necessary to generate stream output data on Gen6
2482 * (for transform feedback).
2483 *
2484 * If send_commit_msg is true, this is the last piece of stream output data
2485 * from this thread, so send the data as a committed write. According to the
2486 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2487 *
2488 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2489 * writes are complete by sending the final write as a committed write."
2490 */
2491 void
2492 brw_svb_write(struct brw_compile *p,
2493 struct brw_reg dest,
2494 GLuint msg_reg_nr,
2495 struct brw_reg src0,
2496 GLuint binding_table_index,
2497 bool send_commit_msg)
2498 {
2499 struct brw_instruction *insn;
2500
2501 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2502
2503 insn = next_insn(p, BRW_OPCODE_SEND);
2504 brw_set_dest(p, insn, dest);
2505 brw_set_src0(p, insn, src0);
2506 brw_set_src1(p, insn, brw_imm_d(0));
2507 brw_set_dp_write_message(p, insn,
2508 binding_table_index,
2509 0, /* msg_control: ignored */
2510 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2511 1, /* msg_length */
2512 true, /* header_present */
2513 0, /* last_render_target: ignored */
2514 send_commit_msg, /* response_length */
2515 0, /* end_of_thread */
2516 send_commit_msg); /* send_commit_msg */
2517 }