i965: call next_insn() before referencing an instruction by index
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
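/* E.g. an 8-wide register in a compressed (SIMD16) program yields
 * BRW_EXECUTE_16; otherwise the register's own width is used directly.
 */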
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61 void
62 gen6_resolve_implied_move(struct brw_compile *p,
63 struct brw_reg *src,
64 GLuint msg_reg_nr)
65 {
66 struct intel_context *intel = &p->brw->intel;
67 if (intel->gen < 6)
68 return;
69
70 if (src->file == BRW_MESSAGE_REGISTER_FILE)
71 return;
72
73 if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74 brw_push_insn_state(p);
75 brw_set_mask_control(p, BRW_MASK_DISABLE);
76 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77 brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78 retype(*src, BRW_REGISTER_TYPE_UD));
79 brw_pop_insn_state(p);
80 }
81 *src = brw_message_reg(msg_reg_nr);
82 }
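/* A minimal usage sketch (with hypothetical caller-side names `dest`,
 * `src` and `msg_reg_nr`): resolve the implied move first, then emit
 * the SEND.
 *
 *    gen6_resolve_implied_move(p, &src, msg_reg_nr);
 *    insn = brw_next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, insn, dest);
 *    brw_set_src0(p, insn, src);
 *    ... fill in the message descriptor ...
 */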
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 struct intel_context *intel = &p->brw->intel;
88 if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
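      /* Gen7 has no separate MRF file; message payloads are built in the
       * top GRFs instead, so m(n) is remapped to g(n + 111), e.g. m1
       * becomes g112.
       */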
89 reg->file = BRW_GENERAL_REGISTER_FILE;
90 reg->nr += 111;
91 }
92 }
93
94
95 void
96 brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
97 struct brw_reg dest)
98 {
99 if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
100 dest.file != BRW_MESSAGE_REGISTER_FILE)
101 assert(dest.nr < 128);
102
103 gen7_convert_mrf_to_grf(p, &dest);
104
105 insn->bits1.da1.dest_reg_file = dest.file;
106 insn->bits1.da1.dest_reg_type = dest.type;
107 insn->bits1.da1.dest_address_mode = dest.address_mode;
108
109 if (dest.address_mode == BRW_ADDRESS_DIRECT) {
110 insn->bits1.da1.dest_reg_nr = dest.nr;
111
112 if (insn->header.access_mode == BRW_ALIGN_1) {
113 insn->bits1.da1.dest_subreg_nr = dest.subnr;
114 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
115 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
116 insn->bits1.da1.dest_horiz_stride = dest.hstride;
117 }
118 else {
119 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
120 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
 121                /* even though it's ignored in da16, it still needs to be set to '01' */
122 insn->bits1.da16.dest_horiz_stride = 1;
123 }
124 }
125 else {
126 insn->bits1.ia1.dest_subreg_nr = dest.subnr;
127
128 /* These are different sizes in align1 vs align16:
129 */
130 if (insn->header.access_mode == BRW_ALIGN_1) {
131 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
132 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
133 dest.hstride = BRW_HORIZONTAL_STRIDE_1;
134 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
135 }
136 else {
137 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
 138                /* even though it's ignored in da16, it still needs to be set to '01' */
139 insn->bits1.ia16.dest_horiz_stride = 1;
140 }
141 }
142
143 /* NEW: Set the execution size based on dest.width and
144 * insn->compression_control:
145 */
146 guess_execution_size(p, insn, dest);
147 }
148
149 extern int reg_type_size[];
150
151 static void
152 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
153 {
154 int hstride_for_reg[] = {0, 1, 2, 4};
155 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
156 int width_for_reg[] = {1, 2, 4, 8, 16};
157 int execsize_for_reg[] = {1, 2, 4, 8, 16};
158 int width, hstride, vstride, execsize;
159
160 if (reg.file == BRW_IMMEDIATE_VALUE) {
161 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
162 * mean the destination has to be 128-bit aligned and the
163 * destination horiz stride has to be a word.
164 */
165 if (reg.type == BRW_REGISTER_TYPE_V) {
166 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
167 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
168 }
169
170 return;
171 }
172
173 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 174        reg.nr == BRW_ARF_NULL)
175 return;
176
177 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
178 hstride = hstride_for_reg[reg.hstride];
179
180 if (reg.vstride == 0xf) {
181 vstride = -1;
182 } else {
183 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
184 vstride = vstride_for_reg[reg.vstride];
185 }
186
187 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
188 width = width_for_reg[reg.width];
189
190 assert(insn->header.execution_size >= 0 &&
191 insn->header.execution_size < Elements(execsize_for_reg));
192 execsize = execsize_for_reg[insn->header.execution_size];
193
194 /* Restrictions from 3.3.10: Register Region Restrictions. */
195 /* 3. */
196 assert(execsize >= width);
197
198 /* 4. */
199 if (execsize == width && hstride != 0) {
200 assert(vstride == -1 || vstride == width * hstride);
201 }
202
203 /* 5. */
204 if (execsize == width && hstride == 0) {
205 /* no restriction on vstride. */
206 }
207
208 /* 6. */
209 if (width == 1) {
210 assert(hstride == 0);
211 }
212
213 /* 7. */
214 if (execsize == 1 && width == 1) {
215 assert(hstride == 0);
216 assert(vstride == 0);
217 }
218
219 /* 8. */
220 if (vstride == 0 && hstride == 0) {
221 assert(width == 1);
222 }
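    /* Worked example (sketch): an execsize-8 access to an <8;8,1> region
     * (vstride 8, width 8, hstride 1) satisfies rule 4 above, since
     * vstride == width * hstride == 8; a scalar <0;1,0> region instead
     * satisfies rules 6-8 (width 1, hstride 0, vstride 0).
     */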
223
224 /* 10. Check destination issues. */
225 }
226
227 void
228 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
229 struct brw_reg reg)
230 {
 231    if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
232 assert(reg.nr < 128);
233
234 gen7_convert_mrf_to_grf(p, &reg);
235
236 validate_reg(insn, reg);
237
238 insn->bits1.da1.src0_reg_file = reg.file;
239 insn->bits1.da1.src0_reg_type = reg.type;
240 insn->bits2.da1.src0_abs = reg.abs;
241 insn->bits2.da1.src0_negate = reg.negate;
242 insn->bits2.da1.src0_address_mode = reg.address_mode;
243
244 if (reg.file == BRW_IMMEDIATE_VALUE) {
245 insn->bits3.ud = reg.dw1.ud;
246
247 /* Required to set some fields in src1 as well:
248 */
249 insn->bits1.da1.src1_reg_file = 0; /* arf */
250 insn->bits1.da1.src1_reg_type = reg.type;
251 }
252 else
253 {
254 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
255 if (insn->header.access_mode == BRW_ALIGN_1) {
256 insn->bits2.da1.src0_subreg_nr = reg.subnr;
257 insn->bits2.da1.src0_reg_nr = reg.nr;
258 }
259 else {
260 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
261 insn->bits2.da16.src0_reg_nr = reg.nr;
262 }
263 }
264 else {
265 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
266
267 if (insn->header.access_mode == BRW_ALIGN_1) {
268 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
269 }
270 else {
271 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
272 }
273 }
274
275 if (insn->header.access_mode == BRW_ALIGN_1) {
276 if (reg.width == BRW_WIDTH_1 &&
277 insn->header.execution_size == BRW_EXECUTE_1) {
278 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
279 insn->bits2.da1.src0_width = BRW_WIDTH_1;
280 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
281 }
282 else {
283 insn->bits2.da1.src0_horiz_stride = reg.hstride;
284 insn->bits2.da1.src0_width = reg.width;
285 insn->bits2.da1.src0_vert_stride = reg.vstride;
286 }
287 }
288 else {
289 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
290 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
291 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
292 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
293
 294           /* This is an oddity of the fact that we use the same register
 295            * descriptions for align_16 as for align_1:
 296            */
297 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
298 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
299 else
300 insn->bits2.da16.src0_vert_stride = reg.vstride;
301 }
302 }
303 }
304
305
306 void brw_set_src1(struct brw_compile *p,
307 struct brw_instruction *insn,
308 struct brw_reg reg)
309 {
310 assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
311
312 assert(reg.nr < 128);
313
314 gen7_convert_mrf_to_grf(p, &reg);
315
316 validate_reg(insn, reg);
317
318 insn->bits1.da1.src1_reg_file = reg.file;
319 insn->bits1.da1.src1_reg_type = reg.type;
320 insn->bits3.da1.src1_abs = reg.abs;
321 insn->bits3.da1.src1_negate = reg.negate;
322
323 /* Only src1 can be immediate in two-argument instructions.
324 */
325 assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
326
327 if (reg.file == BRW_IMMEDIATE_VALUE) {
328 insn->bits3.ud = reg.dw1.ud;
329 }
330 else {
331 /* This is a hardware restriction, which may or may not be lifted
332 * in the future:
333 */
334 assert (reg.address_mode == BRW_ADDRESS_DIRECT);
335 /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
336
337 if (insn->header.access_mode == BRW_ALIGN_1) {
338 insn->bits3.da1.src1_subreg_nr = reg.subnr;
339 insn->bits3.da1.src1_reg_nr = reg.nr;
340 }
341 else {
342 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
343 insn->bits3.da16.src1_reg_nr = reg.nr;
344 }
345
346 if (insn->header.access_mode == BRW_ALIGN_1) {
347 if (reg.width == BRW_WIDTH_1 &&
348 insn->header.execution_size == BRW_EXECUTE_1) {
349 insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
350 insn->bits3.da1.src1_width = BRW_WIDTH_1;
351 insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
352 }
353 else {
354 insn->bits3.da1.src1_horiz_stride = reg.hstride;
355 insn->bits3.da1.src1_width = reg.width;
356 insn->bits3.da1.src1_vert_stride = reg.vstride;
357 }
358 }
359 else {
360 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
361 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
362 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
363 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
364
 365           /* This is an oddity of the fact that we use the same register
 366            * descriptions for align_16 as for align_1:
 367            */
368 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
369 insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
370 else
371 insn->bits3.da16.src1_vert_stride = reg.vstride;
372 }
373 }
374 }
375
376 /**
377 * Set the Message Descriptor and Extended Message Descriptor fields
378 * for SEND messages.
379 *
380 * \note This zeroes out the Function Control bits, so it must be called
381 * \b before filling out any message-specific data. Callers can
382 * choose not to fill in irrelevant bits; they will be zero.
383 */
384 static void
385 brw_set_message_descriptor(struct brw_compile *p,
386 struct brw_instruction *inst,
387 enum brw_message_target sfid,
388 unsigned msg_length,
389 unsigned response_length,
390 bool header_present,
391 bool end_of_thread)
392 {
393 struct intel_context *intel = &p->brw->intel;
394
395 brw_set_src1(p, inst, brw_imm_d(0));
396
397 if (intel->gen >= 5) {
398 inst->bits3.generic_gen5.header_present = header_present;
399 inst->bits3.generic_gen5.response_length = response_length;
400 inst->bits3.generic_gen5.msg_length = msg_length;
401 inst->bits3.generic_gen5.end_of_thread = end_of_thread;
402
403 if (intel->gen >= 6) {
404 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
405 inst->header.destreg__conditionalmod = sfid;
406 } else {
407 /* Set Extended Message Descriptor (ex_desc) */
408 inst->bits2.send_gen5.sfid = sfid;
409 inst->bits2.send_gen5.end_of_thread = end_of_thread;
410 }
411 } else {
412 inst->bits3.generic.response_length = response_length;
413 inst->bits3.generic.msg_length = msg_length;
414 inst->bits3.generic.msg_target = sfid;
415 inst->bits3.generic.end_of_thread = end_of_thread;
416 }
417 }
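/* Callers follow the pattern sketched below: set the descriptor first
 * (which zeroes the Function Control bits via brw_set_src1), then fill
 * in the SFID-specific fields, as brw_set_ff_sync_message() does:
 *
 *    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
 *                               1, response_length, true, end_of_thread);
 *    insn->bits3.urb_gen5.opcode = 1;
 */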
418
419 static void brw_set_math_message( struct brw_compile *p,
420 struct brw_instruction *insn,
421 GLuint function,
422 GLuint integer_type,
423 bool low_precision,
424 bool saturate,
425 GLuint dataType )
426 {
427 struct brw_context *brw = p->brw;
428 struct intel_context *intel = &brw->intel;
429 unsigned msg_length;
430 unsigned response_length;
431
432 /* Infer message length from the function */
433 switch (function) {
434 case BRW_MATH_FUNCTION_POW:
435 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
436 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
437 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
438 msg_length = 2;
439 break;
440 default:
441 msg_length = 1;
442 break;
443 }
444
445 /* Infer response length from the function */
446 switch (function) {
447 case BRW_MATH_FUNCTION_SINCOS:
448 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
449 response_length = 2;
450 break;
451 default:
452 response_length = 1;
453 break;
454 }
455
456 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
457 msg_length, response_length, false, false);
458 if (intel->gen == 5) {
459 insn->bits3.math_gen5.function = function;
460 insn->bits3.math_gen5.int_type = integer_type;
461 insn->bits3.math_gen5.precision = low_precision;
462 insn->bits3.math_gen5.saturate = saturate;
463 insn->bits3.math_gen5.data_type = dataType;
464 insn->bits3.math_gen5.snapshot = 0;
465 } else {
466 insn->bits3.math.function = function;
467 insn->bits3.math.int_type = integer_type;
468 insn->bits3.math.precision = low_precision;
469 insn->bits3.math.saturate = saturate;
470 insn->bits3.math.data_type = dataType;
471 }
472 }
473
474
475 static void brw_set_ff_sync_message(struct brw_compile *p,
476 struct brw_instruction *insn,
477 bool allocate,
478 GLuint response_length,
479 bool end_of_thread)
480 {
481 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
482 1, response_length, true, end_of_thread);
483 insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
484 insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
485 insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
486 insn->bits3.urb_gen5.allocate = allocate;
487 insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
488 insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
489 }
490
491 static void brw_set_urb_message( struct brw_compile *p,
492 struct brw_instruction *insn,
493 bool allocate,
494 bool used,
495 GLuint msg_length,
496 GLuint response_length,
497 bool end_of_thread,
498 bool complete,
499 GLuint offset,
500 GLuint swizzle_control )
501 {
502 struct brw_context *brw = p->brw;
503 struct intel_context *intel = &brw->intel;
504
505 brw_set_message_descriptor(p, insn, BRW_SFID_URB,
506 msg_length, response_length, true, end_of_thread);
507 if (intel->gen == 7) {
508 insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
509 insn->bits3.urb_gen7.offset = offset;
510 assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
511 insn->bits3.urb_gen7.swizzle_control = swizzle_control;
512 /* per_slot_offset = 0 makes it ignore offsets in message header */
513 insn->bits3.urb_gen7.per_slot_offset = 0;
514 insn->bits3.urb_gen7.complete = complete;
515 } else if (intel->gen >= 5) {
516 insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
517 insn->bits3.urb_gen5.offset = offset;
518 insn->bits3.urb_gen5.swizzle_control = swizzle_control;
519 insn->bits3.urb_gen5.allocate = allocate;
520 insn->bits3.urb_gen5.used = used; /* ? */
521 insn->bits3.urb_gen5.complete = complete;
522 } else {
523 insn->bits3.urb.opcode = 0; /* ? */
524 insn->bits3.urb.offset = offset;
525 insn->bits3.urb.swizzle_control = swizzle_control;
526 insn->bits3.urb.allocate = allocate;
527 insn->bits3.urb.used = used; /* ? */
528 insn->bits3.urb.complete = complete;
529 }
530 }
531
532 void
533 brw_set_dp_write_message(struct brw_compile *p,
534 struct brw_instruction *insn,
535 GLuint binding_table_index,
536 GLuint msg_control,
537 GLuint msg_type,
538 GLuint msg_length,
539 bool header_present,
540 GLuint last_render_target,
541 GLuint response_length,
542 GLuint end_of_thread,
543 GLuint send_commit_msg)
544 {
545 struct brw_context *brw = p->brw;
546 struct intel_context *intel = &brw->intel;
547 unsigned sfid;
548
549 if (intel->gen >= 7) {
550 /* Use the Render Cache for RT writes; otherwise use the Data Cache */
551 if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
552 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
553 else
554 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
555 } else if (intel->gen == 6) {
556 /* Use the render cache for all write messages. */
557 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
558 } else {
559 sfid = BRW_SFID_DATAPORT_WRITE;
560 }
561
562 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
563 header_present, end_of_thread);
564
565 if (intel->gen >= 7) {
566 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
567 insn->bits3.gen7_dp.msg_control = msg_control;
568 insn->bits3.gen7_dp.last_render_target = last_render_target;
569 insn->bits3.gen7_dp.msg_type = msg_type;
570 } else if (intel->gen == 6) {
571 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
572 insn->bits3.gen6_dp.msg_control = msg_control;
573 insn->bits3.gen6_dp.last_render_target = last_render_target;
574 insn->bits3.gen6_dp.msg_type = msg_type;
575 insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
576 } else if (intel->gen == 5) {
577 insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
578 insn->bits3.dp_write_gen5.msg_control = msg_control;
579 insn->bits3.dp_write_gen5.last_render_target = last_render_target;
580 insn->bits3.dp_write_gen5.msg_type = msg_type;
581 insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
582 } else {
583 insn->bits3.dp_write.binding_table_index = binding_table_index;
584 insn->bits3.dp_write.msg_control = msg_control;
585 insn->bits3.dp_write.last_render_target = last_render_target;
586 insn->bits3.dp_write.msg_type = msg_type;
587 insn->bits3.dp_write.send_commit_msg = send_commit_msg;
588 }
589 }
590
591 void
592 brw_set_dp_read_message(struct brw_compile *p,
593 struct brw_instruction *insn,
594 GLuint binding_table_index,
595 GLuint msg_control,
596 GLuint msg_type,
597 GLuint target_cache,
598 GLuint msg_length,
599 GLuint response_length)
600 {
601 struct brw_context *brw = p->brw;
602 struct intel_context *intel = &brw->intel;
603 unsigned sfid;
604
605 if (intel->gen >= 7) {
606 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
607 } else if (intel->gen == 6) {
608 if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
609 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
610 else
611 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
612 } else {
613 sfid = BRW_SFID_DATAPORT_READ;
614 }
615
616 brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
617 true, false);
618
619 if (intel->gen >= 7) {
620 insn->bits3.gen7_dp.binding_table_index = binding_table_index;
621 insn->bits3.gen7_dp.msg_control = msg_control;
622 insn->bits3.gen7_dp.last_render_target = 0;
623 insn->bits3.gen7_dp.msg_type = msg_type;
624 } else if (intel->gen == 6) {
625 insn->bits3.gen6_dp.binding_table_index = binding_table_index;
626 insn->bits3.gen6_dp.msg_control = msg_control;
627 insn->bits3.gen6_dp.last_render_target = 0;
628 insn->bits3.gen6_dp.msg_type = msg_type;
629 insn->bits3.gen6_dp.send_commit_msg = 0;
630 } else if (intel->gen == 5) {
631 insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
632 insn->bits3.dp_read_gen5.msg_control = msg_control;
633 insn->bits3.dp_read_gen5.msg_type = msg_type;
634 insn->bits3.dp_read_gen5.target_cache = target_cache;
635 } else if (intel->is_g4x) {
636 insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
637 insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
638 insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
639 insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/
640 } else {
641 insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
642 insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
643 insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
644 insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
645 }
646 }
647
648 static void brw_set_sampler_message(struct brw_compile *p,
649 struct brw_instruction *insn,
650 GLuint binding_table_index,
651 GLuint sampler,
652 GLuint msg_type,
653 GLuint response_length,
654 GLuint msg_length,
655 GLuint header_present,
656 GLuint simd_mode,
657 GLuint return_format)
658 {
659 struct brw_context *brw = p->brw;
660 struct intel_context *intel = &brw->intel;
661
662 brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
663 response_length, header_present, false);
664
665 if (intel->gen >= 7) {
666 insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
667 insn->bits3.sampler_gen7.sampler = sampler;
668 insn->bits3.sampler_gen7.msg_type = msg_type;
669 insn->bits3.sampler_gen7.simd_mode = simd_mode;
670 } else if (intel->gen >= 5) {
671 insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
672 insn->bits3.sampler_gen5.sampler = sampler;
673 insn->bits3.sampler_gen5.msg_type = msg_type;
674 insn->bits3.sampler_gen5.simd_mode = simd_mode;
675 } else if (intel->is_g4x) {
676 insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
677 insn->bits3.sampler_g4x.sampler = sampler;
678 insn->bits3.sampler_g4x.msg_type = msg_type;
679 } else {
680 insn->bits3.sampler.binding_table_index = binding_table_index;
681 insn->bits3.sampler.sampler = sampler;
682 insn->bits3.sampler.msg_type = msg_type;
683 insn->bits3.sampler.return_format = return_format;
684 }
685 }
686
687
688 #define next_insn brw_next_insn
689 struct brw_instruction *
690 brw_next_insn(struct brw_compile *p, GLuint opcode)
691 {
692 struct brw_instruction *insn;
693
694 assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
695
696 insn = &p->store[p->nr_insn++];
697 memcpy(insn, p->current, sizeof(*insn));
698
699 /* Reset this one-shot flag:
700 */
701
702 if (p->current->header.destreg__conditionalmod) {
703 p->current->header.destreg__conditionalmod = 0;
704 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
705 }
706
707 insn->header.opcode = opcode;
708 return insn;
709 }
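/* Note that every emitted instruction starts life as a copy of p->current,
 * the default-state template maintained by brw_push_insn_state() /
 * brw_pop_insn_state() and the brw_set_*() state helpers; only the opcode
 * is filled in here.
 */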
710
711 static struct brw_instruction *brw_alu1( struct brw_compile *p,
712 GLuint opcode,
713 struct brw_reg dest,
714 struct brw_reg src )
715 {
716 struct brw_instruction *insn = next_insn(p, opcode);
717 brw_set_dest(p, insn, dest);
718 brw_set_src0(p, insn, src);
719 return insn;
720 }
721
722 static struct brw_instruction *brw_alu2(struct brw_compile *p,
723 GLuint opcode,
724 struct brw_reg dest,
725 struct brw_reg src0,
726 struct brw_reg src1 )
727 {
728 struct brw_instruction *insn = next_insn(p, opcode);
729 brw_set_dest(p, insn, dest);
730 brw_set_src0(p, insn, src0);
731 brw_set_src1(p, insn, src1);
732 return insn;
733 }
734
735
736 /***********************************************************************
737 * Convenience routines.
738 */
739 #define ALU1(OP) \
740 struct brw_instruction *brw_##OP(struct brw_compile *p, \
741 struct brw_reg dest, \
742 struct brw_reg src0) \
743 { \
744 return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
745 }
746
747 #define ALU2(OP) \
748 struct brw_instruction *brw_##OP(struct brw_compile *p, \
749 struct brw_reg dest, \
750 struct brw_reg src0, \
751 struct brw_reg src1) \
752 { \
753 return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
754 }
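/* For instance, ALU2(SEL) below expands to:
 *
 *    struct brw_instruction *brw_SEL(struct brw_compile *p,
 *                                    struct brw_reg dest,
 *                                    struct brw_reg src0,
 *                                    struct brw_reg src1)
 *    {
 *       return brw_alu2(p, BRW_OPCODE_SEL, dest, src0, src1);
 *    }
 */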
755
756 /* Rounding operations (other than RNDD) require two instructions - the first
757 * stores a rounded value (possibly the wrong way) in the dest register, but
758 * also sets a per-channel "increment bit" in the flag register. A predicated
759 * add of 1.0 fixes dest to contain the desired result.
760 *
761 * Sandybridge and later appear to round correctly without an ADD.
762 */
763 #define ROUND(OP) \
764 void brw_##OP(struct brw_compile *p, \
765 struct brw_reg dest, \
766 struct brw_reg src) \
767 { \
768 struct brw_instruction *rnd, *add; \
769 rnd = next_insn(p, BRW_OPCODE_##OP); \
770 brw_set_dest(p, rnd, dest); \
771 brw_set_src0(p, rnd, src); \
772 \
773 if (p->brw->intel.gen < 6) { \
774 /* turn on round-increments */ \
775 rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \
776 add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
777 add->header.predicate_control = BRW_PREDICATE_NORMAL; \
778 } \
779 }
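/* On pre-gen6 hardware, ROUND(RNDZ) therefore emits, in effect:
 *
 *    rndz.r dest, src          // sets per-channel increment bits in f0
 *    (+f0) add dest, dest, 1.0F
 *
 * while on gen6+ only the rounding instruction itself is needed.
 */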
780
781
782 ALU1(MOV)
783 ALU2(SEL)
784 ALU1(NOT)
785 ALU2(AND)
786 ALU2(OR)
787 ALU2(XOR)
788 ALU2(SHR)
789 ALU2(SHL)
790 ALU2(RSR)
791 ALU2(RSL)
792 ALU2(ASR)
793 ALU1(FRC)
794 ALU1(RNDD)
795 ALU2(MAC)
796 ALU2(MACH)
797 ALU1(LZD)
798 ALU2(DP4)
799 ALU2(DPH)
800 ALU2(DP3)
801 ALU2(DP2)
802 ALU2(LINE)
803 ALU2(PLN)
804
805
806 ROUND(RNDZ)
807 ROUND(RNDE)
808
809
810 struct brw_instruction *brw_ADD(struct brw_compile *p,
811 struct brw_reg dest,
812 struct brw_reg src0,
813 struct brw_reg src1)
814 {
815 /* 6.2.2: add */
816 if (src0.type == BRW_REGISTER_TYPE_F ||
817 (src0.file == BRW_IMMEDIATE_VALUE &&
818 src0.type == BRW_REGISTER_TYPE_VF)) {
819 assert(src1.type != BRW_REGISTER_TYPE_UD);
820 assert(src1.type != BRW_REGISTER_TYPE_D);
821 }
822
823 if (src1.type == BRW_REGISTER_TYPE_F ||
824 (src1.file == BRW_IMMEDIATE_VALUE &&
825 src1.type == BRW_REGISTER_TYPE_VF)) {
826 assert(src0.type != BRW_REGISTER_TYPE_UD);
827 assert(src0.type != BRW_REGISTER_TYPE_D);
828 }
829
830 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
831 }
832
833 struct brw_instruction *brw_MUL(struct brw_compile *p,
834 struct brw_reg dest,
835 struct brw_reg src0,
836 struct brw_reg src1)
837 {
838 /* 6.32.38: mul */
839 if (src0.type == BRW_REGISTER_TYPE_D ||
840 src0.type == BRW_REGISTER_TYPE_UD ||
841 src1.type == BRW_REGISTER_TYPE_D ||
842 src1.type == BRW_REGISTER_TYPE_UD) {
843 assert(dest.type != BRW_REGISTER_TYPE_F);
844 }
845
846 if (src0.type == BRW_REGISTER_TYPE_F ||
847 (src0.file == BRW_IMMEDIATE_VALUE &&
848 src0.type == BRW_REGISTER_TYPE_VF)) {
849 assert(src1.type != BRW_REGISTER_TYPE_UD);
850 assert(src1.type != BRW_REGISTER_TYPE_D);
851 }
852
853 if (src1.type == BRW_REGISTER_TYPE_F ||
854 (src1.file == BRW_IMMEDIATE_VALUE &&
855 src1.type == BRW_REGISTER_TYPE_VF)) {
856 assert(src0.type != BRW_REGISTER_TYPE_UD);
857 assert(src0.type != BRW_REGISTER_TYPE_D);
858 }
859
860 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
861 src0.nr != BRW_ARF_ACCUMULATOR);
862 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
863 src1.nr != BRW_ARF_ACCUMULATOR);
864
865 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
866 }
867
868
869 void brw_NOP(struct brw_compile *p)
870 {
871 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
872 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
873 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
874 brw_set_src1(p, insn, brw_imm_ud(0x0));
875 }
876
877
878
879
880
881 /***********************************************************************
882 * Comparisons, if/else/endif
883 */
884
885 struct brw_instruction *brw_JMPI(struct brw_compile *p,
886 struct brw_reg dest,
887 struct brw_reg src0,
888 struct brw_reg src1)
889 {
890 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
891
892 insn->header.execution_size = 1;
893 insn->header.compression_control = BRW_COMPRESSION_NONE;
894 insn->header.mask_control = BRW_MASK_DISABLE;
895
896 p->current->header.predicate_control = BRW_PREDICATE_NONE;
897
898 return insn;
899 }
900
901 static void
902 push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
903 {
904 p->if_stack[p->if_stack_depth] = inst - p->store;
905
906 p->if_stack_depth++;
907 if (p->if_stack_array_size <= p->if_stack_depth) {
908 p->if_stack_array_size *= 2;
909 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
910 p->if_stack_array_size);
911 }
912 }
913
914 static struct brw_instruction *
915 pop_if_stack(struct brw_compile *p)
916 {
917 p->if_stack_depth--;
918 return &p->store[p->if_stack[p->if_stack_depth]];
919 }
920
921 static void
922 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
923 {
924 if (p->loop_stack_array_size < p->loop_stack_depth) {
925 p->loop_stack_array_size *= 2;
926 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
927 p->loop_stack_array_size);
928 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
929 p->loop_stack_array_size);
930 }
931
932 p->loop_stack[p->loop_stack_depth] = inst - p->store;
933 p->loop_stack_depth++;
934 p->if_depth_in_loop[p->loop_stack_depth] = 0;
935 }
936
937 static struct brw_instruction *
938 get_inner_do_insn(struct brw_compile *p)
939 {
940 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
941 }
942
943 /* EU takes the value from the flag register and pushes it onto some
944 * sort of a stack (presumably merging with any flag value already on
945 * the stack). Within an if block, the flags at the top of the stack
 946  * control execution on each channel of the unit, e.g. on each of the
947 * 16 pixel values in our wm programs.
948 *
949 * When the matching 'else' instruction is reached (presumably by
950 * countdown of the instruction count patched in by our ELSE/ENDIF
 951  * functions), the relevant flags are inverted.
952 *
953 * When the matching 'endif' instruction is reached, the flags are
954 * popped off. If the stack is now empty, normal execution resumes.
955 */
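/* A typical usage sketch for an 8-wide program (with hypothetical
 * operands `x` and `y`):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, y);
 *    brw_IF(p, BRW_EXECUTE_8);
 *    ... "then" block ...
 *    brw_ELSE(p);
 *    ... "else" block ...
 *    brw_ENDIF(p);
 */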
956 struct brw_instruction *
957 brw_IF(struct brw_compile *p, GLuint execute_size)
958 {
959 struct intel_context *intel = &p->brw->intel;
960 struct brw_instruction *insn;
961
962 insn = next_insn(p, BRW_OPCODE_IF);
963
964 /* Override the defaults for this instruction:
965 */
966 if (intel->gen < 6) {
967 brw_set_dest(p, insn, brw_ip_reg());
968 brw_set_src0(p, insn, brw_ip_reg());
969 brw_set_src1(p, insn, brw_imm_d(0x0));
970 } else if (intel->gen == 6) {
971 brw_set_dest(p, insn, brw_imm_w(0));
972 insn->bits1.branch_gen6.jump_count = 0;
973 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
974 brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
975 } else {
976 brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
977 brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
978 brw_set_src1(p, insn, brw_imm_ud(0));
979 insn->bits3.break_cont.jip = 0;
980 insn->bits3.break_cont.uip = 0;
981 }
982
983 insn->header.execution_size = execute_size;
984 insn->header.compression_control = BRW_COMPRESSION_NONE;
985 insn->header.predicate_control = BRW_PREDICATE_NORMAL;
986 insn->header.mask_control = BRW_MASK_ENABLE;
987 if (!p->single_program_flow)
988 insn->header.thread_control = BRW_THREAD_SWITCH;
989
990 p->current->header.predicate_control = BRW_PREDICATE_NONE;
991
992 push_if_stack(p, insn);
993 p->if_depth_in_loop[p->loop_stack_depth]++;
994 return insn;
995 }
996
997 /* This function is only used for gen6-style IF instructions with an
998 * embedded comparison (conditional modifier). It is not used on gen7.
999 */
1000 struct brw_instruction *
1001 gen6_IF(struct brw_compile *p, uint32_t conditional,
1002 struct brw_reg src0, struct brw_reg src1)
1003 {
1004 struct brw_instruction *insn;
1005
1006 insn = next_insn(p, BRW_OPCODE_IF);
1007
1008 brw_set_dest(p, insn, brw_imm_w(0));
1009 if (p->compressed) {
1010 insn->header.execution_size = BRW_EXECUTE_16;
1011 } else {
1012 insn->header.execution_size = BRW_EXECUTE_8;
1013 }
1014 insn->bits1.branch_gen6.jump_count = 0;
1015 brw_set_src0(p, insn, src0);
1016 brw_set_src1(p, insn, src1);
1017
1018 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1019 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1020 insn->header.destreg__conditionalmod = conditional;
1021
1022 if (!p->single_program_flow)
1023 insn->header.thread_control = BRW_THREAD_SWITCH;
1024
1025 push_if_stack(p, insn);
1026 return insn;
1027 }
1028
1029 /**
1030 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1031 */
1032 static void
1033 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1034 struct brw_instruction *if_inst,
1035 struct brw_instruction *else_inst)
1036 {
1037 /* The next instruction (where the ENDIF would be, if it existed) */
1038 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1039
1040 assert(p->single_program_flow);
1041 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1042 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1043 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1044
1045 /* Convert IF to an ADD instruction that moves the instruction pointer
1046 * to the first instruction of the ELSE block. If there is no ELSE
1047 * block, point to where ENDIF would be. Reverse the predicate.
1048 *
1049 * There's no need to execute an ENDIF since we don't need to do any
1050 * stack operations, and if we're currently executing, we just want to
1051 * continue normally.
1052 */
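   /* The ADDs operate on IP in bytes; each native instruction is 128 bits
    * (16 bytes), hence the multiplications by 16 below.
    */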
1053 if_inst->header.opcode = BRW_OPCODE_ADD;
1054 if_inst->header.predicate_inverse = 1;
1055
1056 if (else_inst != NULL) {
1057 /* Convert ELSE to an ADD instruction that points where the ENDIF
1058 * would be.
1059 */
1060 else_inst->header.opcode = BRW_OPCODE_ADD;
1061
1062 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1063 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1064 } else {
1065 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1066 }
1067 }
1068
1069 /**
1070 * Patch IF and ELSE instructions with appropriate jump targets.
1071 */
1072 static void
1073 patch_IF_ELSE(struct brw_compile *p,
1074 struct brw_instruction *if_inst,
1075 struct brw_instruction *else_inst,
1076 struct brw_instruction *endif_inst)
1077 {
1078 struct intel_context *intel = &p->brw->intel;
1079
1080 /* We shouldn't be patching IF and ELSE instructions in single program flow
1081 * mode when gen < 6, because in single program flow mode on those
1082 * platforms, we convert flow control instructions to conditional ADDs that
1083 * operate on IP (see brw_ENDIF).
1084 *
1085 * However, on Gen6, writing to IP doesn't work in single program flow mode
1086 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1087 * not be updated by non-flow control instructions."). And on later
1088 * platforms, there is no significant benefit to converting control flow
1089 * instructions to conditional ADDs. So we do patch IF and ELSE
1090 * instructions in single program flow mode on those platforms.
1091 */
1092 if (intel->gen < 6)
1093 assert(!p->single_program_flow);
1094
1095 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1096 assert(endif_inst != NULL);
1097 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1098
1099 unsigned br = 1;
1100    /* The jump count is in units of 64-bit data chunks, so one 128-bit
1101     * instruction requires 2 chunks.
1102 */
1103 if (intel->gen >= 5)
1104 br = 2;
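   /* E.g. on gen5, where each 128-bit instruction spans two 64-bit chunks,
    * a jump over N instructions is encoded as 2 * N.
    */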
1105
1106 assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1107 endif_inst->header.execution_size = if_inst->header.execution_size;
1108
1109 if (else_inst == NULL) {
1110 /* Patch IF -> ENDIF */
1111 if (intel->gen < 6) {
1112 /* Turn it into an IFF, which means no mask stack operations for
1113 * all-false and jumping past the ENDIF.
1114 */
1115 if_inst->header.opcode = BRW_OPCODE_IFF;
1116 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1117 if_inst->bits3.if_else.pop_count = 0;
1118 if_inst->bits3.if_else.pad0 = 0;
1119 } else if (intel->gen == 6) {
1120 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1121 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1122 } else {
1123 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1124 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1125 }
1126 } else {
1127 else_inst->header.execution_size = if_inst->header.execution_size;
1128
1129 /* Patch IF -> ELSE */
1130 if (intel->gen < 6) {
1131 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1132 if_inst->bits3.if_else.pop_count = 0;
1133 if_inst->bits3.if_else.pad0 = 0;
1134 } else if (intel->gen == 6) {
1135 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1136 }
1137
1138 /* Patch ELSE -> ENDIF */
1139 if (intel->gen < 6) {
1140 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1141 * matching ENDIF.
1142 */
1143 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1144 else_inst->bits3.if_else.pop_count = 1;
1145 else_inst->bits3.if_else.pad0 = 0;
1146 } else if (intel->gen == 6) {
1147 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1148 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1149 } else {
1150 /* The IF instruction's JIP should point just past the ELSE */
1151 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1152 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1153 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1154 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1155 }
1156 }
1157 }
1158
1159 void
1160 brw_ELSE(struct brw_compile *p)
1161 {
1162 struct intel_context *intel = &p->brw->intel;
1163 struct brw_instruction *insn;
1164
1165 insn = next_insn(p, BRW_OPCODE_ELSE);
1166
1167 if (intel->gen < 6) {
1168 brw_set_dest(p, insn, brw_ip_reg());
1169 brw_set_src0(p, insn, brw_ip_reg());
1170 brw_set_src1(p, insn, brw_imm_d(0x0));
1171 } else if (intel->gen == 6) {
1172 brw_set_dest(p, insn, brw_imm_w(0));
1173 insn->bits1.branch_gen6.jump_count = 0;
1174 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1175 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1176 } else {
1177 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1178 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1179 brw_set_src1(p, insn, brw_imm_ud(0));
1180 insn->bits3.break_cont.jip = 0;
1181 insn->bits3.break_cont.uip = 0;
1182 }
1183
1184 insn->header.compression_control = BRW_COMPRESSION_NONE;
1185 insn->header.mask_control = BRW_MASK_ENABLE;
1186 if (!p->single_program_flow)
1187 insn->header.thread_control = BRW_THREAD_SWITCH;
1188
1189 push_if_stack(p, insn);
1190 }
1191
1192 void
1193 brw_ENDIF(struct brw_compile *p)
1194 {
1195 struct intel_context *intel = &p->brw->intel;
1196 struct brw_instruction *insn;
1197 struct brw_instruction *else_inst = NULL;
1198 struct brw_instruction *if_inst = NULL;
1199 struct brw_instruction *tmp;
1200 bool emit_endif = true;
1201
1202 /* In single program flow mode, we can express IF and ELSE instructions
1203 * equivalently as ADD instructions that operate on IP. On platforms prior
1204 * to Gen6, flow control instructions cause an implied thread switch, so
1205 * this is a significant savings.
1206 *
1207 * However, on Gen6, writing to IP doesn't work in single program flow mode
1208 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1209 * not be updated by non-flow control instructions."). And on later
1210 * platforms, there is no significant benefit to converting control flow
1211 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1212 * Gen5.
1213 */
1214 if (intel->gen < 6 && p->single_program_flow)
1215 emit_endif = false;
1216
1217 /*
1218     * A single next_insn() may change the base address of the instruction
1219     * store memory (p->store), so call it first, before referencing an
1220     * instruction in the store by index.
1221 */
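   /* I.e., never write (a sketch of the broken ordering this guards
    * against):
    *
    *    tmp = pop_if_stack(p);            // pointer into p->store
    *    insn = next_insn(p, ...);         // may move p->store
    *
    * since `tmp` could now dangle; the ENDIF is emitted first instead.
    */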
1222 if (emit_endif)
1223 insn = next_insn(p, BRW_OPCODE_ENDIF);
1224
1225 /* Pop the IF and (optional) ELSE instructions from the stack */
1226 p->if_depth_in_loop[p->loop_stack_depth]--;
1227 tmp = pop_if_stack(p);
1228 if (tmp->header.opcode == BRW_OPCODE_ELSE) {
1229 else_inst = tmp;
1230 tmp = pop_if_stack(p);
1231 }
1232 if_inst = tmp;
1233
1234 if (!emit_endif) {
1235 /* ENDIF is useless; don't bother emitting it. */
1236 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1237 return;
1238 }
1239
1240 if (intel->gen < 6) {
1241 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1242 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1243 brw_set_src1(p, insn, brw_imm_d(0x0));
1244 } else if (intel->gen == 6) {
1245 brw_set_dest(p, insn, brw_imm_w(0));
1246 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1247 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1248 } else {
1249 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1250 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1251 brw_set_src1(p, insn, brw_imm_ud(0));
1252 }
1253
1254 insn->header.compression_control = BRW_COMPRESSION_NONE;
1255 insn->header.mask_control = BRW_MASK_ENABLE;
1256 insn->header.thread_control = BRW_THREAD_SWITCH;
1257
1258 /* Also pop item off the stack in the endif instruction: */
1259 if (intel->gen < 6) {
1260 insn->bits3.if_else.jump_count = 0;
1261 insn->bits3.if_else.pop_count = 1;
1262 insn->bits3.if_else.pad0 = 0;
1263 } else if (intel->gen == 6) {
1264 insn->bits1.branch_gen6.jump_count = 2;
1265 } else {
1266 insn->bits3.break_cont.jip = 2;
1267 }
1268 patch_IF_ELSE(p, if_inst, else_inst, insn);
1269 }
1270
1271 struct brw_instruction *brw_BREAK(struct brw_compile *p)
1272 {
1273 struct intel_context *intel = &p->brw->intel;
1274 struct brw_instruction *insn;
1275
1276 insn = next_insn(p, BRW_OPCODE_BREAK);
1277 if (intel->gen >= 6) {
1278 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1279 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1280 brw_set_src1(p, insn, brw_imm_d(0x0));
1281 } else {
1282 brw_set_dest(p, insn, brw_ip_reg());
1283 brw_set_src0(p, insn, brw_ip_reg());
1284 brw_set_src1(p, insn, brw_imm_d(0x0));
1285 insn->bits3.if_else.pad0 = 0;
1286 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1287 }
1288 insn->header.compression_control = BRW_COMPRESSION_NONE;
1289 insn->header.execution_size = BRW_EXECUTE_8;
1290
1291 return insn;
1292 }
1293
1294 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1295 {
1296 struct brw_instruction *insn;
1297
1298 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1299 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1300 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1301 brw_set_dest(p, insn, brw_ip_reg());
1302 brw_set_src0(p, insn, brw_ip_reg());
1303 brw_set_src1(p, insn, brw_imm_d(0x0));
1304
1305 insn->header.compression_control = BRW_COMPRESSION_NONE;
1306 insn->header.execution_size = BRW_EXECUTE_8;
1307 return insn;
1308 }
1309
1310 struct brw_instruction *brw_CONT(struct brw_compile *p)
1311 {
1312 struct brw_instruction *insn;
1313 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1314 brw_set_dest(p, insn, brw_ip_reg());
1315 brw_set_src0(p, insn, brw_ip_reg());
1316 brw_set_src1(p, insn, brw_imm_d(0x0));
1317 insn->header.compression_control = BRW_COMPRESSION_NONE;
1318 insn->header.execution_size = BRW_EXECUTE_8;
1319 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1320 insn->bits3.if_else.pad0 = 0;
1321 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1322 return insn;
1323 }
1324
1325 /* DO/WHILE loop:
1326 *
1327 * The DO/WHILE is just an unterminated loop -- break or continue are
1328 * used for control within the loop. We have a few ways they can be
1329 * done.
1330 *
1331 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1332 * jip and no DO instruction.
1333 *
1334 * For non-uniform control flow pre-gen6, there's a DO instruction to
1335 * push the mask, and a WHILE to jump back, and BREAK to get out and
1336 * pop the mask.
1337 *
1338 * For gen6, there's no more mask stack, so no need for DO. WHILE
1339 * just points back to the first instruction of the loop.
1340 */
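/* Usage sketch for a non-uniform loop (gen6+ simply skips the DO
 * instruction, as described above):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *    ... loop body, using brw_BREAK() / brw_CONT() as needed ...
 *    brw_WHILE(p);
 */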
1341 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1342 {
1343 struct intel_context *intel = &p->brw->intel;
1344
1345 if (intel->gen >= 6 || p->single_program_flow) {
1346 push_loop_stack(p, &p->store[p->nr_insn]);
1347 return &p->store[p->nr_insn];
1348 } else {
1349 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1350
1351 push_loop_stack(p, insn);
1352
1353 /* Override the defaults for this instruction:
1354 */
1355 brw_set_dest(p, insn, brw_null_reg());
1356 brw_set_src0(p, insn, brw_null_reg());
1357 brw_set_src1(p, insn, brw_null_reg());
1358
1359 insn->header.compression_control = BRW_COMPRESSION_NONE;
1360 insn->header.execution_size = execute_size;
1361 insn->header.predicate_control = BRW_PREDICATE_NONE;
1362 /* insn->header.mask_control = BRW_MASK_ENABLE; */
1363 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1364
1365 return insn;
1366 }
1367 }
1368
1369 /**
1370 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1371 * instruction here.
1372 *
1373 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1374 * nesting, since it can always just point to the end of the block/current loop.
1375 */
1376 static void
1377 brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1378 {
1379 struct intel_context *intel = &p->brw->intel;
1380 struct brw_instruction *do_inst = get_inner_do_insn(p);
1381 struct brw_instruction *inst;
1382 int br = (intel->gen == 5) ? 2 : 1;
1383
1384 for (inst = while_inst - 1; inst != do_inst; inst--) {
1385 /* If the jump count is != 0, that means that this instruction has already
1386 * been patched because it's part of a loop inside of the one we're
1387 * patching.
1388 */
1389 if (inst->header.opcode == BRW_OPCODE_BREAK &&
1390 inst->bits3.if_else.jump_count == 0) {
1391 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1392 } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1393 inst->bits3.if_else.jump_count == 0) {
1394 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1395 }
1396 }
1397 }
1398
1399 struct brw_instruction *brw_WHILE(struct brw_compile *p)
1400 {
1401 struct intel_context *intel = &p->brw->intel;
1402 struct brw_instruction *insn, *do_insn;
1403 GLuint br = 1;
1404
1405 if (intel->gen >= 5)
1406 br = 2;
1407
1408 if (intel->gen >= 7) {
1409 insn = next_insn(p, BRW_OPCODE_WHILE);
1410 do_insn = get_inner_do_insn(p);
1411
1412 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1413 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1414 brw_set_src1(p, insn, brw_imm_ud(0));
1415 insn->bits3.break_cont.jip = br * (do_insn - insn);
1416
1417 insn->header.execution_size = BRW_EXECUTE_8;
1418 } else if (intel->gen == 6) {
1419 insn = next_insn(p, BRW_OPCODE_WHILE);
1420 do_insn = get_inner_do_insn(p);
1421
1422 brw_set_dest(p, insn, brw_imm_w(0));
1423 insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1424 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1425 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1426
1427 insn->header.execution_size = BRW_EXECUTE_8;
1428 } else {
1429 if (p->single_program_flow) {
1430 insn = next_insn(p, BRW_OPCODE_ADD);
1431 do_insn = get_inner_do_insn(p);
1432
1433 brw_set_dest(p, insn, brw_ip_reg());
1434 brw_set_src0(p, insn, brw_ip_reg());
1435 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1436 insn->header.execution_size = BRW_EXECUTE_1;
1437 } else {
1438 insn = next_insn(p, BRW_OPCODE_WHILE);
1439 do_insn = get_inner_do_insn(p);
1440
1441 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1442
1443 brw_set_dest(p, insn, brw_ip_reg());
1444 brw_set_src0(p, insn, brw_ip_reg());
1445 brw_set_src1(p, insn, brw_imm_d(0));
1446
1447 insn->header.execution_size = do_insn->header.execution_size;
1448 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1449 insn->bits3.if_else.pop_count = 0;
1450 insn->bits3.if_else.pad0 = 0;
1451
1452 brw_patch_break_cont(p, insn);
1453 }
1454 }
1455 insn->header.compression_control = BRW_COMPRESSION_NONE;
1456 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1457
1458 p->loop_stack_depth--;
1459
1460 return insn;
1461 }
1462
1463
1464 /* FORWARD JUMPS:
1465 */
1466 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1467 {
1468 struct intel_context *intel = &p->brw->intel;
1469 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1470 GLuint jmpi = 1;
1471
1472 if (intel->gen >= 5)
1473 jmpi = 2;
1474
1475 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1476 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1477
1478 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1479 }
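/* Sketch: record the index of a JMPI, emit the code to be skipped, then
 * land the jump at the current end of the stream:
 *
 *    int jmp_idx = p->nr_insn;
 *    brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    ... instructions to skip ...
 *    brw_land_fwd_jump(p, jmp_idx);
 */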
1480
1481
1482
1483 /* To integrate with the above, it makes sense that the comparison
1484 * instruction should populate the flag register. It might be simpler
1485 * just to use the flag reg for most WM tasks?
1486 */
1487 void brw_CMP(struct brw_compile *p,
1488 struct brw_reg dest,
1489 GLuint conditional,
1490 struct brw_reg src0,
1491 struct brw_reg src1)
1492 {
1493 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1494
1495 insn->header.destreg__conditionalmod = conditional;
1496 brw_set_dest(p, insn, dest);
1497 brw_set_src0(p, insn, src0);
1498 brw_set_src1(p, insn, src1);
1499
1500 /* guess_execution_size(insn, src0); */
1501
1502
1503 /* Make it so that future instructions will use the computed flag
1504 * value until brw_set_predicate_control_flag_value() is called
1505 * again.
1506 */
1507 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1508 dest.nr == 0) {
1509 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1510 p->flag_value = 0xff;
1511 }
1512 }
1513
1514 /* Issue a 'wait' instruction on notification register n1; the host can
1515  * program MMIO to wake up the thread. */
1516 void brw_WAIT (struct brw_compile *p)
1517 {
1518 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1519 struct brw_reg src = brw_notification_1_reg();
1520
1521 brw_set_dest(p, insn, src);
1522 brw_set_src0(p, insn, src);
1523 brw_set_src1(p, insn, brw_null_reg());
1524    insn->header.execution_size = 0; /* must be BRW_EXECUTE_1 */
1525 insn->header.predicate_control = 0;
1526 insn->header.compression_control = 0;
1527 }
1528
1529
1530 /***********************************************************************
1531 * Helpers for the various SEND message types:
1532 */
1533
1534 /** Extended math function, float[8].
1535 */
1536 void brw_math( struct brw_compile *p,
1537 struct brw_reg dest,
1538 GLuint function,
1539 GLuint saturate,
1540 GLuint msg_reg_nr,
1541 struct brw_reg src,
1542 GLuint data_type,
1543 GLuint precision )
1544 {
1545 struct intel_context *intel = &p->brw->intel;
1546
1547 if (intel->gen >= 6) {
1548 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1549
1550 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1551 assert(src.file == BRW_GENERAL_REGISTER_FILE);
1552
1553 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1554 if (intel->gen == 6)
1555 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1556
1557 /* Source modifiers are ignored for extended math instructions on Gen6. */
1558 if (intel->gen == 6) {
1559 assert(!src.negate);
1560 assert(!src.abs);
1561 }
1562
1563 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1564 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1565 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1566 assert(src.type != BRW_REGISTER_TYPE_F);
1567 } else {
1568 assert(src.type == BRW_REGISTER_TYPE_F);
1569 }
1570
1571 /* Math is the same ISA format as other opcodes, except that CondModifier
1572 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1573 */
1574 insn->header.destreg__conditionalmod = function;
1575 insn->header.saturate = saturate;
1576
1577 brw_set_dest(p, insn, dest);
1578 brw_set_src0(p, insn, src);
1579 brw_set_src1(p, insn, brw_null_reg());
1580 } else {
1581 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1582
1583 /* Example code doesn't set predicate_control for send
1584 * instructions.
1585 */
1586 insn->header.predicate_control = 0;
1587 insn->header.destreg__conditionalmod = msg_reg_nr;
1588
1589 brw_set_dest(p, insn, dest);
1590 brw_set_src0(p, insn, src);
1591 brw_set_math_message(p,
1592 insn,
1593 function,
1594 src.type == BRW_REGISTER_TYPE_D,
1595 precision,
1596 saturate,
1597 data_type);
1598 }
1599 }
1600
1601 /** Extended math function, float[8].
1602 */
1603 void brw_math2(struct brw_compile *p,
1604 struct brw_reg dest,
1605 GLuint function,
1606 struct brw_reg src0,
1607 struct brw_reg src1)
1608 {
1609 struct intel_context *intel = &p->brw->intel;
1610 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1611
1612 assert(intel->gen >= 6);
1613 (void) intel;
1614
1615
1616 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1617 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1618 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1619
1620 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1621 if (intel->gen == 6) {
1622 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1623 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1624 }
1625
1626 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1627 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1628 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1629 assert(src0.type != BRW_REGISTER_TYPE_F);
1630 assert(src1.type != BRW_REGISTER_TYPE_F);
1631 } else {
1632 assert(src0.type == BRW_REGISTER_TYPE_F);
1633 assert(src1.type == BRW_REGISTER_TYPE_F);
1634 }
1635
1636 /* Source modifiers are ignored for extended math instructions on Gen6. */
1637 if (intel->gen == 6) {
1638 assert(!src0.negate);
1639 assert(!src0.abs);
1640 assert(!src1.negate);
1641 assert(!src1.abs);
1642 }
1643
1644 /* Math is the same ISA format as other opcodes, except that CondModifier
1645 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1646 */
1647 insn->header.destreg__conditionalmod = function;
1648
1649 brw_set_dest(p, insn, dest);
1650 brw_set_src0(p, insn, src0);
1651 brw_set_src1(p, insn, src1);
1652 }

/**
 * Extended math function, float[16].
 * Uses two SEND instructions on hardware before Gen6.
 */
void brw_math_16( struct brw_compile *p,
                  struct brw_reg dest,
                  GLuint function,
                  GLuint saturate,
                  GLuint msg_reg_nr,
                  struct brw_reg src,
                  GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_MATH);

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
      return;
   }

   /* First instruction:
    */
   brw_push_insn_state(p);
   brw_set_predicate_control_flag_value(p, 0xff);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        BRW_MATH_INTEGER_UNSIGNED,
                        precision,
                        saturate,
                        BRW_MATH_DATA_VECTOR);

   /* Second instruction:
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
   insn->header.destreg__conditionalmod = msg_reg_nr+1;

   brw_set_dest(p, insn, offset(dest,1));
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        BRW_MATH_INTEGER_UNSIGNED,
                        precision,
                        saturate,
                        BRW_MATH_DATA_VECTOR);

   brw_pop_insn_state(p);
}
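
/* On pre-Gen6 hardware the code above expands to two SIMD8 SENDs, along the
 * lines of (register numbers hypothetical, disassembly schematic):
 *
 *    send (8) g4<1>UW m2 math          { Align1 }
 *    send (8) g5<1>UW m3 math          { Align1 SecHalf }
 *
 * The second SEND is marked BRW_COMPRESSION_2NDHALF and uses msg_reg_nr+1
 * and offset(dest, 1), so the upper eight channels get their own message
 * register and destination register.
 */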


/**
 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes). Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header. This is g0, with g0.2 filled with
    * the offset. We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
         insn->header.compression_control = BRW_COMPRESSION_NONE;
         src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection. Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (intel->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
                               msg_control,
                               msg_type,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}
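
/* Spill sketch (illustrative; the MRF/GRF numbers and the scratch offset
 * are made up). The data to be written is assumed to go in the message
 * register that follows the header:
 *
 *    brw_MOV(p, retype(brw_message_reg(15), BRW_REGISTER_TYPE_UD),
 *            retype(brw_vec8_grf(12, 0), BRW_REGISTER_TYPE_UD));
 *    brw_oword_block_write_scratch(p, brw_message_reg(14), 1, 64);
 *
 * With num_regs == 1 this emits an mlen 2 message: m14 carries the header
 * and m15 carries the single GRF of payload.
 */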


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes). Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control;
   int rlen;

   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest); /* UW? */
      if (intel->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
                              insn,
                              255, /* binding table index (255=stateless) */
                              msg_control,
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                              1, /* msg_length */
                              rlen);
   }
}
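
/* Unspill sketch (illustrative; the numbers mirror the spill example above):
 *
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(12, 0),
 *                                 brw_message_reg(14), 1, 64);
 *
 * reads the two owords written at scratch offset 64 back into g12.
 */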

/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_compile *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;

   /* On newer hardware, offset is in units of owords. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (intel->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
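
/* Constant-fetch sketch (illustrative; the destination GRF and the binding
 * table index of 0 are hypothetical):
 *
 *    brw_oword_block_read(p, brw_vec8_grf(2, 0), brw_message_reg(1), 16, 0);
 *
 * This fetches the float[4] at byte offset 16 of surface 0 into the lower
 * half of g2.
 */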

/**
 * Read a set of dwords from the data port Data Cache (const buffer).
 *
 * Location (in buffer) appears as UD offsets in the register after
 * the provided mrf header reg.
 */
void brw_dword_scattered_read(struct brw_compile *p,
                              struct brw_reg dest,
                              struct brw_reg mrf,
                              uint32_t bind_table_index)
{
   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_pop_insn_state(p);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, brw_null_reg());

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
                           BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           2, /* msg_length */
                           1); /* response_length */
}
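
/* The caller places the per-channel dword offsets in the register after the
 * header, which is why msg_length is 2. A sketch (all register and surface
 * numbers hypothetical; offsets_reg is assumed to be a UD-typed GRF already
 * holding the offsets):
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD),
 *            offsets_reg);
 *    brw_dword_scattered_read(p, brw_vec8_grf(4, 0), brw_message_reg(1), 0);
 */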


/**
 * Read float[4] constant(s) from VS constant buffer.
 * For relative addressing, two float[4] constants will be read into 'dest'.
 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
 */
void brw_dp_READ_4_vs(struct brw_compile *p,
                      struct brw_reg dest,
                      GLuint location,
                      GLuint bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_reg_nr = 1;

   if (intel->gen >= 6)
      location /= 16;

   /* Set up MRF[1] with location/offset into const buffer */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
                     BRW_REGISTER_TYPE_UD),
           brw_imm_ud(location));
   brw_pop_insn_state(p);

   insn = next_insn(p, BRW_OPCODE_SEND);

   insn->header.predicate_control = BRW_PREDICATE_NONE;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.destreg__conditionalmod = msg_reg_nr;
   insn->header.mask_control = BRW_MASK_DISABLE;

   brw_set_dest(p, insn, dest);
   if (intel->gen >= 6) {
      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           0,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           1); /* response_length (1 Oword) */
}
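
/* Usage sketch (illustrative; the destination GRF and binding table index
 * are hypothetical):
 *
 *    brw_dp_READ_4_vs(p, brw_vec8_grf(2, 0), 32, 0);
 *
 * reads one float[4] constant at location 32 of surface 0 into the lower
 * half of g2.
 */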

/**
 * Read a float[4] constant per vertex from VS constant buffer, with
 * relative addressing.
 */
void brw_dp_READ_4_vs_relative(struct brw_compile *p,
                               struct brw_reg dest,
                               struct brw_reg addr_reg,
                               GLuint offset,
                               GLuint bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_reg src = brw_vec8_grf(0, 0);
   int msg_type;

   /* Set up MRF[1] with offset into const buffer */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
    * fields ignored.
    */
   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
           addr_reg, brw_imm_d(offset));
   brw_pop_insn_state(p);

   gen6_resolve_implied_move(p, &src, 0);
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   insn->header.predicate_control = BRW_PREDICATE_NONE;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.destreg__conditionalmod = 0;
   insn->header.mask_control = BRW_MASK_DISABLE;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);

   if (intel->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (intel->gen == 5 || intel->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           2, /* msg_length */
                           1); /* response_length */
}


void brw_fb_WRITE(struct brw_compile *p,
                  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  bool eot,
                  bool header_present)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_control, msg_type;
   struct brw_reg dest;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (intel->gen >= 6 && binding_table_index == 0) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (intel->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   if (dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            msg_length,
                            header_present,
                            1, /* last render target write */
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}
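
/* SIMD16 render target write sketch (illustrative; the message register,
 * surface index and message length are hypothetical). A headerless Gen6+
 * write of four color components, ending the thread:
 *
 *    brw_fb_WRITE(p, 16, 2, brw_message_reg(2), 0, 8, 0, true, false);
 *
 * The four components of a SIMD16 color occupy eight message registers,
 * hence msg_length == 8, with no response expected.
 */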


/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed. See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_compile *p,
                struct brw_reg dest,
                GLuint msg_reg_nr,
                struct brw_reg src0,
                GLuint binding_table_index,
                GLuint sampler,
                GLuint writemask,
                GLuint msg_type,
                GLuint response_length,
                GLuint msg_length,
                GLuint header_present,
                GLuint simd_mode,
                GLuint return_format)
{
   struct intel_context *intel = &p->brw->intel;
   bool need_stall = false;

   if (writemask == 0) {
      /*printf("%s: zero writemask??\n", __FUNCTION__); */
      return;
   }

   /* Hardware doesn't do destination dependency checking on send
    * instructions properly. Add a workaround which generates the
    * dependency by other means. In practice it seems like this bug
    * only crops up for texture samples, and only where registers are
    * written by the send and then written again later without being
    * read in between. Luckily for us, we already track that
    * information and use it to modify the writemask for the
    * instruction, so that is a guide for whether a workaround is
    * needed.
    */
   if (writemask != WRITEMASK_XYZW) {
      GLuint dst_offset = 0;
      GLuint i, newmask = 0, len = 0;

      for (i = 0; i < 4; i++) {
         if (writemask & (1<<i))
            break;
         dst_offset += 2;
      }
      for (; i < 4; i++) {
         if (!(writemask & (1<<i)))
            break;
         newmask |= 1<<i;
         len++;
      }

      if (newmask != writemask) {
         need_stall = true;
         /* printf("need stall %x %x\n", newmask , writemask); */
      } else {
         bool dispatch_16 = false;

         struct brw_reg m1 = brw_message_reg(msg_reg_nr);

         guess_execution_size(p, p->current, dest);
         if (p->current->header.execution_size == BRW_EXECUTE_16)
            dispatch_16 = true;

         newmask = ~newmask & WRITEMASK_XYZW;

         brw_push_insn_state(p);

         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
         brw_set_mask_control(p, BRW_MASK_DISABLE);

         brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
         brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));

         brw_pop_insn_state(p);

         src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
         dest = offset(dest, dst_offset);

         /* For 16-wide dispatch, masked channels are skipped in the
          * response. For 8-wide, masked channels still take up slots,
          * and are just not written to.
          */
         if (dispatch_16)
            response_length = len * 2;
      }
   }

   {
      struct brw_instruction *insn;

      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

      insn = next_insn(p, BRW_OPCODE_SEND);
      insn->header.predicate_control = 0; /* XXX */
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      if (intel->gen < 6)
         insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src0);
      brw_set_sampler_message(p, insn,
                              binding_table_index,
                              sampler,
                              msg_type,
                              response_length,
                              msg_length,
                              header_present,
                              simd_mode,
                              return_format);
   }

   if (need_stall) {
      struct brw_reg reg = vec8(offset(dest, response_length-1));

      /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
       */
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
              retype(reg, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
}
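
/* Worked example of the writemask handling above: writemask = WRITEMASK_YZ
 * (0x6). The first loop skips the unwritten X channel, leaving
 * dst_offset == 2; the second collects Y and Z, so newmask == 0x6 and
 * len == 2. Since newmask matches writemask, no stall is needed: the
 * message header channel-disable field is set to ~newmask (X and W), the
 * destination is advanced by dst_offset, and for 16-wide dispatch the
 * response length shrinks to len * 2 == 4 registers.
 */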

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style. Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
                   bool allocate,
                   bool used,
                   GLuint msg_length,
                   GLuint response_length,
                   bool eot,
                   bool writes_complete,
                   GLuint offset,
                   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
                       insn,
                       allocate,
                       used,
                       msg_length,
                       response_length,
                       eot,
                       writes_complete,
                       offset,
                       swizzle);
}

static int
brw_find_next_block_end(struct brw_compile *p, int start)
{
   int ip;

   for (ip = start + 1; ip < p->nr_insn; ip++) {
      struct brw_instruction *insn = &p->store[ip];

      switch (insn->header.opcode) {
      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_WHILE:
         return ip;
      }
   }
   assert(!"not reached");
   return start + 1;
}

/* There is no DO instruction on gen6, so to find the end of the loop
 * we scan forward for a WHILE whose backward jump lands at or before
 * our start instruction.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int br = 2;

   for (ip = start + 1; ip < p->nr_insn; ip++) {
      struct brw_instruction *insn = &p->store[ip];

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
         int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
                                   : insn->bits3.break_cont.jip;
         if (ip + jip / br <= start)
            return ip;
      }
   }
   assert(!"not reached");
   return start + 1;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK and CONT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int br = 2;

   if (intel->gen < 6)
      return;

   for (ip = 0; ip < p->nr_insn; ip++) {
      struct brw_instruction *insn = &p->store[ip];

      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
            br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
         break;
      case BRW_OPCODE_CONTINUE:
         insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
         insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
      }
   }
}
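
/* Worked example (instruction indices hypothetical): a BREAK at ip 10 whose
 * enclosing block ends at ip 14 and whose loop's WHILE sits at ip 20 gets
 * jip = 2 * (14 - 10) = 8. On Gen6 its uip = 2 * (20 - 10 + 1) = 22, since
 * Gen6 UIP must point just past the WHILE; on Gen7 it would be
 * 2 * (20 - 10) = 20.
 */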

void brw_ff_sync(struct brw_compile *p,
                 struct brw_reg dest,
                 GLuint msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 GLuint response_length,
                 bool eot)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gen6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write. According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *    writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_compile *p,
              struct brw_reg dest,
              GLuint msg_reg_nr,
              struct brw_reg src0,
              GLuint binding_table_index,
              bool send_commit_msg)
{
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}
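
/* Stream-output sketch (illustrative; the registers and the surface index
 * are hypothetical). Intermediate vertex data is written uncommitted; the
 * final write sets send_commit_msg so the thread can safely end afterwards:
 *
 *    brw_svb_write(p, brw_null_reg(), 1, brw_vec8_grf(4, 0), 0, false);
 *    brw_svb_write(p, brw_vec8_grf(5, 0), 1, brw_vec8_grf(4, 0), 0, true);
 *
 * The committed write returns one register (the write commit), so it needs
 * a real destination.
 */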