i965/gen7: Move SOL stage disable to gen7_sol_state.c
[mesa.git] / src / mesa / drivers / dri / i965 / brw_eu_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "brw_context.h"
34 #include "brw_defines.h"
35 #include "brw_eu.h"
36
37 #include "glsl/ralloc.h"
38
39 /***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43 static void guess_execution_size(struct brw_compile *p,
44 struct brw_instruction *insn,
45 struct brw_reg reg)
46 {
47 if (reg.width == BRW_WIDTH_8 && p->compressed)
48 insn->header.execution_size = BRW_EXECUTE_16;
49 else
50 insn->header.execution_size = reg.width; /* note - definitions are compatible */
51 }
52
53
54 /**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case. This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   /* Pre-gen6 hardware performs the implied move itself; nothing to do. */
   if (intel->gen < 6)
      return;

   /* Operand is already a message register; no move needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Emit an uncompressed MOV with the execution mask disabled so the
       * whole message register is written regardless of channel enables.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Point the caller's source at the chosen message register (for the
    * null register, no copy was emitted — it is redirected directly).
    */
   *src = brw_message_reg(msg_reg_nr);
}
83
84 static void
85 gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86 {
87 struct intel_context *intel = &p->brw->intel;
88 if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
89 reg->file = BRW_GENERAL_REGISTER_FILE;
90 reg->nr += 111;
91 }
92 }
93
94
/**
 * Encode @dest as the destination operand of @insn (bits1).
 *
 * MRF destinations are first rewritten to their GRF equivalents on gen7.
 * The subregister/stride/writemask fields are packed differently for
 * direct vs. indirect addressing and for align1 vs. align16 access mode,
 * hence the four cases below.  Also derives the execution size from the
 * destination width (see guess_execution_size).
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A zero horizontal stride is not used for destinations here;
	  * promote it to a stride of one before encoding.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16 addresses subregisters in 16-byte units. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
148
149 extern int reg_type_size[];
150
151 static void
152 validate_reg(struct brw_instruction *insn, struct brw_reg reg)
153 {
154 int hstride_for_reg[] = {0, 1, 2, 4};
155 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
156 int width_for_reg[] = {1, 2, 4, 8, 16};
157 int execsize_for_reg[] = {1, 2, 4, 8, 16};
158 int width, hstride, vstride, execsize;
159
160 if (reg.file == BRW_IMMEDIATE_VALUE) {
161 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
162 * mean the destination has to be 128-bit aligned and the
163 * destination horiz stride has to be a word.
164 */
165 if (reg.type == BRW_REGISTER_TYPE_V) {
166 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
167 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
168 }
169
170 return;
171 }
172
173 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
174 reg.file == BRW_ARF_NULL)
175 return;
176
177 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
178 hstride = hstride_for_reg[reg.hstride];
179
180 if (reg.vstride == 0xf) {
181 vstride = -1;
182 } else {
183 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
184 vstride = vstride_for_reg[reg.vstride];
185 }
186
187 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
188 width = width_for_reg[reg.width];
189
190 assert(insn->header.execution_size >= 0 &&
191 insn->header.execution_size < Elements(execsize_for_reg));
192 execsize = execsize_for_reg[insn->header.execution_size];
193
194 /* Restrictions from 3.3.10: Register Region Restrictions. */
195 /* 3. */
196 assert(execsize >= width);
197
198 /* 4. */
199 if (execsize == width && hstride != 0) {
200 assert(vstride == -1 || vstride == width * hstride);
201 }
202
203 /* 5. */
204 if (execsize == width && hstride == 0) {
205 /* no restriction on vstride. */
206 }
207
208 /* 6. */
209 if (width == 1) {
210 assert(hstride == 0);
211 }
212
213 /* 7. */
214 if (execsize == 1 && width == 1) {
215 assert(hstride == 0);
216 assert(vstride == 0);
217 }
218
219 /* 8. */
220 if (vstride == 0 && hstride == 0) {
221 assert(width == 1);
222 }
223
224 /* 10. Check destination issues. */
225 }
226
227 void
228 brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
229 struct brw_reg reg)
230 {
231 if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
232 assert(reg.nr < 128);
233
234 gen7_convert_mrf_to_grf(p, &reg);
235
236 validate_reg(insn, reg);
237
238 insn->bits1.da1.src0_reg_file = reg.file;
239 insn->bits1.da1.src0_reg_type = reg.type;
240 insn->bits2.da1.src0_abs = reg.abs;
241 insn->bits2.da1.src0_negate = reg.negate;
242 insn->bits2.da1.src0_address_mode = reg.address_mode;
243
244 if (reg.file == BRW_IMMEDIATE_VALUE) {
245 insn->bits3.ud = reg.dw1.ud;
246
247 /* Required to set some fields in src1 as well:
248 */
249 insn->bits1.da1.src1_reg_file = 0; /* arf */
250 insn->bits1.da1.src1_reg_type = reg.type;
251 }
252 else
253 {
254 if (reg.address_mode == BRW_ADDRESS_DIRECT) {
255 if (insn->header.access_mode == BRW_ALIGN_1) {
256 insn->bits2.da1.src0_subreg_nr = reg.subnr;
257 insn->bits2.da1.src0_reg_nr = reg.nr;
258 }
259 else {
260 insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
261 insn->bits2.da16.src0_reg_nr = reg.nr;
262 }
263 }
264 else {
265 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
266
267 if (insn->header.access_mode == BRW_ALIGN_1) {
268 insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
269 }
270 else {
271 insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
272 }
273 }
274
275 if (insn->header.access_mode == BRW_ALIGN_1) {
276 if (reg.width == BRW_WIDTH_1 &&
277 insn->header.execution_size == BRW_EXECUTE_1) {
278 insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
279 insn->bits2.da1.src0_width = BRW_WIDTH_1;
280 insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
281 }
282 else {
283 insn->bits2.da1.src0_horiz_stride = reg.hstride;
284 insn->bits2.da1.src0_width = reg.width;
285 insn->bits2.da1.src0_vert_stride = reg.vstride;
286 }
287 }
288 else {
289 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
290 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
291 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
292 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
293
294 /* This is an oddity of the fact we're using the same
295 * descriptions for registers in align_16 as align_1:
296 */
297 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
298 insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
299 else
300 insn->bits2.da16.src0_vert_stride = reg.vstride;
301 }
302 }
303 }
304
305
/**
 * Encode @reg as the second source operand of @insn (bits3).
 *
 * src1 shares its instruction dwords with the immediate value and the
 * SEND message descriptor, so it may be an immediate, can never be an
 * MRF, and supports only direct addressing.
 */
void brw_set_src1(struct brw_compile *p,
		  struct brw_instruction *insn,
		  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
	 insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
	 /* Align16 addresses subregisters in 16-byte units. */
	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
	 insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 /* A scalar source in a scalar instruction uses the canonical
	  * <0;1,0> region encoding.
	  */
	 if (reg.width == BRW_WIDTH_1 &&
	     insn->header.execution_size == BRW_EXECUTE_1) {
	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
	 }
	 else {
	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
	    insn->bits3.da1.src1_width = reg.width;
	    insn->bits3.da1.src1_vert_stride = reg.vstride;
	 }
      }
      else {
	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

	 /* This is an oddity of the fact we're using the same
	  * descriptions for registers in align_16 as align_1:
	  */
	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
	 else
	    insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}
375
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   /* Clear the descriptor dwords (src1 aliases the message descriptor). */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Gen4 packs everything, including the target, into bits3. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
418
/* Fill in the SEND descriptor for a message to the extended math shared
 * unit.  Message and response lengths are inferred from the requested
 * function: POW and the integer-division variants take two source
 * operands, and SINCOS / quotient-and-remainder return two results.
 */
static void brw_set_math_message( struct brw_compile *p,
				  struct brw_instruction *insn,
				  GLuint function,
				  GLuint integer_type,
				  bool low_precision,
				  bool saturate,
				  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
			      msg_length, response_length, false, false);
   /* Gen5 uses a different math descriptor layout than gen4. */
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = saturate;
      insn->bits3.math.data_type = dataType;
   }
}
473
474
/* Fill in the URB descriptor for an FF_SYNC message (gen5 URB opcode 1).
 * The message always has a one-register payload with a header; most of
 * the gen5 URB fields are unused by FF_SYNC and are zeroed explicitly.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    GLuint response_length,
				    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
490
/* Fill in the SEND descriptor for a URB write message, using the
 * descriptor layout appropriate to the hardware generation (gen7,
 * gen5/6, or gen4).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 bool allocate,
				 bool used,
				 GLuint msg_length,
				 GLuint response_length,
				 bool end_of_thread,
				 bool complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 has no transpose swizzle mode for URB writes. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
   }
}
531
/**
 * Fill in the SEND descriptor for a data port write message.
 *
 * Picks the shared function ID by generation (gen7 routes render-target
 * writes to the render cache and everything else to the data cache;
 * gen6 uses the render cache for all writes), then fills the
 * generation-specific descriptor fields.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
590
/**
 * Fill in the SEND descriptor for a data port read message.
 *
 * Picks the shared function ID by generation (gen7 always uses the data
 * cache; gen6 chooses between the render and sampler caches based on
 * @target_cache), then fills the generation-specific descriptor fields.
 * Read messages always carry a header and are never end-of-thread here.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      true, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
647
/* Fill in the SEND descriptor for a sampler message, using the
 * generation-specific descriptor layout (gen7, gen5/6, g4x, or gen4;
 * only gen4 encodes the return format).
 */
static void brw_set_sampler_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    GLuint binding_table_index,
                                    GLuint sampler,
                                    GLuint msg_type,
                                    GLuint response_length,
                                    GLuint msg_length,
                                    GLuint header_present,
                                    GLuint simd_mode,
                                    GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
686
687
#define next_insn brw_next_insn
/**
 * Allocate the next instruction slot in the store and set its opcode.
 *
 * The new instruction is seeded with a copy of the current default
 * instruction state (p->current), so callers inherit the prevailing
 * execution size, predication, etc.
 */
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   /* The store is a fixed-size array; overflow is a compiler bug. */
   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);

   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
710
711 static struct brw_instruction *brw_alu1( struct brw_compile *p,
712 GLuint opcode,
713 struct brw_reg dest,
714 struct brw_reg src )
715 {
716 struct brw_instruction *insn = next_insn(p, opcode);
717 brw_set_dest(p, insn, dest);
718 brw_set_src0(p, insn, src);
719 return insn;
720 }
721
722 static struct brw_instruction *brw_alu2(struct brw_compile *p,
723 GLuint opcode,
724 struct brw_reg dest,
725 struct brw_reg src0,
726 struct brw_reg src1 )
727 {
728 struct brw_instruction *insn = next_insn(p, opcode);
729 brw_set_dest(p, insn, dest);
730 brw_set_src0(p, insn, src0);
731 brw_set_src1(p, insn, src1);
732 return insn;
733 }
734
735
736 /***********************************************************************
737 * Convenience routines.
738 */
/* Expand to the public emitter for a one-source ALU instruction. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Expand to the public emitter for a two-source ALU instruction. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->intel.gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)


ROUND(RNDZ)
ROUND(RNDE)
808
809
/* Emit an ADD, asserting the documented restrictions on mixing float
 * sources with dword sources (see the "6.2.2: add" reference below).
 */
struct brw_instruction *brw_ADD(struct brw_compile *p,
				struct brw_reg dest,
				struct brw_reg src0,
				struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
	src0.type == BRW_REGISTER_TYPE_VF)) {
      /* A float source may not be paired with a dword source. */
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
	src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}
832
/* Emit a MUL, asserting the documented restrictions on operand types
 * and on accumulator sources (see the "6.32.38: mul" reference below).
 */
struct brw_instruction *brw_MUL(struct brw_compile *p,
				struct brw_reg dest,
				struct brw_reg src0,
				struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      /* Integer sources may not produce a float destination. */
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
	src0.type == BRW_REGISTER_TYPE_VF)) {
      /* A float source may not be paired with a dword source. */
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
	src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* The accumulator is not a legal MUL source. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
867
868
869 void brw_NOP(struct brw_compile *p)
870 {
871 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
872 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
873 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
874 brw_set_src1(p, insn, brw_imm_ud(0x0));
875 }
876
877
878
879
880
881 /***********************************************************************
882 * Comparisons, if/else/endif
883 */
884
/* Emit a JMPI (jump-indexed) instruction.  JMPI executes as a single
 * channel with the execution mask disabled and uncompressed.  The
 * default predication state is cleared afterwards so that any pending
 * predicate applies only to this jump.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* One-shot: subsequent instructions revert to unpredicated. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
900
/* Record an IF instruction on the if stack so the matching ELSE/ENDIF
 * can locate it later, growing the stack when it fills up.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst;

   p->if_stack_depth++;
   /* Grow after the push so the next call always has a free slot. */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, struct brw_instruction *,
			     p->if_stack_array_size);
   }
}
913
914 static void
915 push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
916 {
917 if (p->loop_stack_array_size < p->loop_stack_depth) {
918 p->loop_stack_array_size *= 2;
919 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
920 p->loop_stack_array_size);
921 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
922 p->loop_stack_array_size);
923 }
924
925 p->loop_stack[p->loop_stack_depth] = inst - p->store;
926 p->loop_stack_depth++;
927 p->if_depth_in_loop[p->loop_stack_depth] = 0;
928 }
929
930 static struct brw_instruction *
931 get_inner_do_insn(struct brw_compile *p)
932 {
933 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
934 }
935
936 /* EU takes the value from the flag register and pushes it onto some
937 * sort of a stack (presumably merging with any flag value already on
938 * the stack). Within an if block, the flags at the top of the stack
939 * control execution on each channel of the unit, eg. on each of the
940 * 16 pixel values in our wm programs.
941 *
942 * When the matching 'else' instruction is reached (presumably by
943 * countdown of the instruction count patched in by our ELSE/ENDIF
944 * functions), the relevent flags are inverted.
945 *
946 * When the matching 'endif' instruction is reached, the flags are
947 * popped off. If the stack is now empty, normal execution resumes.
948 */
/**
 * Emit an IF instruction and push it on the if stack.
 *
 * The operand encoding varies by generation: pre-gen6 uses IP-relative
 * operands with an immediate jump target (patched later by ELSE/ENDIF);
 * gen6 stores the jump count in bits1; gen7 uses the JIP/UIP fields in
 * bits3 (also patched later).
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* JIP/UIP start at 0 and are patched when ELSE/ENDIF are emitted. */
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Predication was a one-shot default; clear it for what follows. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
989
990 /* This function is only used for gen6-style IF instructions with an
991 * embedded comparison (conditional modifier). It is not used on gen7.
992 */
993 struct brw_instruction *
994 gen6_IF(struct brw_compile *p, uint32_t conditional,
995 struct brw_reg src0, struct brw_reg src1)
996 {
997 struct brw_instruction *insn;
998
999 insn = next_insn(p, BRW_OPCODE_IF);
1000
1001 brw_set_dest(p, insn, brw_imm_w(0));
1002 if (p->compressed) {
1003 insn->header.execution_size = BRW_EXECUTE_16;
1004 } else {
1005 insn->header.execution_size = BRW_EXECUTE_8;
1006 }
1007 insn->bits1.branch_gen6.jump_count = 0;
1008 brw_set_src0(p, insn, src0);
1009 brw_set_src1(p, insn, src1);
1010
1011 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1012 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1013 insn->header.destreg__conditionalmod = conditional;
1014
1015 if (!p->single_program_flow)
1016 insn->header.thread_control = BRW_THREAD_SWITCH;
1017
1018 push_if_stack(p, insn);
1019 return insn;
1020 }
1021
1022 /**
1023 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1024 */
1025 static void
1026 convert_IF_ELSE_to_ADD(struct brw_compile *p,
1027 struct brw_instruction *if_inst,
1028 struct brw_instruction *else_inst)
1029 {
1030 /* The next instruction (where the ENDIF would be, if it existed) */
1031 struct brw_instruction *next_inst = &p->store[p->nr_insn];
1032
1033 assert(p->single_program_flow);
1034 assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1035 assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1036 assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1037
1038 /* Convert IF to an ADD instruction that moves the instruction pointer
1039 * to the first instruction of the ELSE block. If there is no ELSE
1040 * block, point to where ENDIF would be. Reverse the predicate.
1041 *
1042 * There's no need to execute an ENDIF since we don't need to do any
1043 * stack operations, and if we're currently executing, we just want to
1044 * continue normally.
1045 */
1046 if_inst->header.opcode = BRW_OPCODE_ADD;
1047 if_inst->header.predicate_inverse = 1;
1048
1049 if (else_inst != NULL) {
1050 /* Convert ELSE to an ADD instruction that points where the ENDIF
1051 * would be.
1052 */
1053 else_inst->header.opcode = BRW_OPCODE_ADD;
1054
1055 if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1056 else_inst->bits3.ud = (next_inst - else_inst) * 16;
1057 } else {
1058 if_inst->bits3.ud = (next_inst - if_inst) * 16;
1059 }
1060 }
1061
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * Called from brw_ENDIF once the location of the ENDIF is known.
 * \param if_inst    the IF to patch (required)
 * \param else_inst  the matching ELSE, or NULL if there is none
 * \param endif_inst the just-emitted ENDIF (required)
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 /* Gen7: both branch offsets target the ENDIF directly. */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1151
/* Emit an ELSE instruction and push it on the if-stack so that
 * brw_ENDIF can patch its jump target later.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      /* Pre-gen6: operates on IP; jump count patched in patch_IF_ELSE(). */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7: JIP/UIP start at zero and are patched at ENDIF time. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1184
/* Close the innermost IF/ELSE block: pop the IF (and optional ELSE)
 * from the if-stack, emit an ENDIF where needed, and patch the jump
 * targets of the popped instructions.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   p->if_stack_depth--;
   if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = p->if_stack[p->if_stack_depth];
      p->if_stack_depth--;
   }
   if_inst = p->if_stack[p->if_stack_depth];

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   insn = next_insn(p, BRW_OPCODE_ENDIF);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1252
1253 struct brw_instruction *brw_BREAK(struct brw_compile *p)
1254 {
1255 struct intel_context *intel = &p->brw->intel;
1256 struct brw_instruction *insn;
1257
1258 insn = next_insn(p, BRW_OPCODE_BREAK);
1259 if (intel->gen >= 6) {
1260 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1261 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1262 brw_set_src1(p, insn, brw_imm_d(0x0));
1263 } else {
1264 brw_set_dest(p, insn, brw_ip_reg());
1265 brw_set_src0(p, insn, brw_ip_reg());
1266 brw_set_src1(p, insn, brw_imm_d(0x0));
1267 insn->bits3.if_else.pad0 = 0;
1268 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1269 }
1270 insn->header.compression_control = BRW_COMPRESSION_NONE;
1271 insn->header.execution_size = BRW_EXECUTE_8;
1272
1273 return insn;
1274 }
1275
1276 struct brw_instruction *gen6_CONT(struct brw_compile *p)
1277 {
1278 struct brw_instruction *insn;
1279
1280 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1281 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1282 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1283 brw_set_dest(p, insn, brw_ip_reg());
1284 brw_set_src0(p, insn, brw_ip_reg());
1285 brw_set_src1(p, insn, brw_imm_d(0x0));
1286
1287 insn->header.compression_control = BRW_COMPRESSION_NONE;
1288 insn->header.execution_size = BRW_EXECUTE_8;
1289 return insn;
1290 }
1291
1292 struct brw_instruction *brw_CONT(struct brw_compile *p)
1293 {
1294 struct brw_instruction *insn;
1295 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1296 brw_set_dest(p, insn, brw_ip_reg());
1297 brw_set_src0(p, insn, brw_ip_reg());
1298 brw_set_src1(p, insn, brw_imm_d(0x0));
1299 insn->header.compression_control = BRW_COMPRESSION_NONE;
1300 insn->header.execution_size = BRW_EXECUTE_8;
1301 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1302 insn->bits3.if_else.pad0 = 0;
1303 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1304 return insn;
1305 }
1306
1307 /* DO/WHILE loop:
1308 *
1309 * The DO/WHILE is just an unterminated loop -- break or continue are
1310 * used for control within the loop. We have a few ways they can be
1311 * done.
1312 *
1313 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1314 * jip and no DO instruction.
1315 *
1316 * For non-uniform control flow pre-gen6, there's a DO instruction to
1317 * push the mask, and a WHILE to jump back, and BREAK to get out and
1318 * pop the mask.
1319 *
1320 * For gen6, there's no more mask stack, so no need for DO. WHILE
1321 * just points back to the first instruction of the loop.
1322 */
/* Open a loop.  Returns the instruction the matching WHILE should jump
 * back to; on gen6+ (and in SPF mode) no actual DO instruction is
 * emitted and the returned pointer is simply the next slot in the store.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      /* No DO needed; just remember where the loop body begins. */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1350
1351 /**
1352 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1353 * instruction here.
1354 *
1355 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1356 * nesting, since it can always just point to the end of the block/current loop.
1357 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Jump counts are in 64-bit chunks on gen5 (two per instruction). */
   int br = (intel->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to the matching DO, fixing up any
    * BREAK/CONTINUE that has not been patched yet.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 /* BREAK jumps just past the WHILE (out of the loop). */
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 /* CONTINUE jumps to the WHILE itself (next iteration test). */
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1380
/* Close the innermost loop opened by brw_DO with a WHILE (or, in
 * pre-gen6 SPF mode, an ADD to IP), and pop the loop stack.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   /* Jump counts are in 64-bit chunks on gen5+ (two per instruction). */
   GLuint br = 1;

   do_insn = get_inner_do_insn(p);

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* Backward jump to the top of the loop body. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: express the backward branch as an ADD to IP. */
	 insn = next_insn(p, BRW_OPCODE_ADD);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Fix up any BREAK/CONTINUE inside the loop now that the WHILE
	  * location is known.
	  */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1442
1443
1444 /* FORWARD JUMPS:
1445 */
1446 void brw_land_fwd_jump(struct brw_compile *p,
1447 struct brw_instruction *jmp_insn)
1448 {
1449 struct intel_context *intel = &p->brw->intel;
1450 struct brw_instruction *landing = &p->store[p->nr_insn];
1451 GLuint jmpi = 1;
1452
1453 if (intel->gen >= 5)
1454 jmpi = 2;
1455
1456 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1457 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1458
1459 jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1460 }
1461
1462
1463
1464 /* To integrate with the above, it makes sense that the comparison
1465 * instruction should populate the flag register. It might be simpler
1466 * just to use the flag reg for most WM tasks?
1467 */
/* Emit a CMP with the given conditional modifier, writing the result to
 * dest and updating the flag register.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      /* dest is an architecture register with nr 0 — presumably the null
       * register, i.e. the CMP is being used only for its flag write.
       */
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}
1494
/* Issue a 'wait' instruction on notification register n1; the host can
 * program MMIO to wake the thread up.
 */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1509
1510
1511 /***********************************************************************
1512 * Helpers for the various SEND message types:
1513 */
1514
1515 /** Extended math function, float[8].
1516 */
/* Emit an extended-math operation: a native MATH instruction on gen6+,
 * or a SEND to the shared math function unit on earlier generations.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint saturate,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer-divide variants require non-float sources; everything
       * else requires float.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   saturate,
			   data_type);
   }
}
1581
1582 /** Extended math function, float[8].
1583 */
/** Extended math function with two sources (gen6+ only).
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);
   /* Silence unused-variable warnings when asserts compile out (NDEBUG). */
   (void) intel;


   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (intel->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* Integer-divide variants require non-float sources; everything else
    * requires float.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (intel->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1634
1635 /**
1636 * Extended math function, float[16].
1637 * Use 2 send instructions.
1638 */
void brw_math_16( struct brw_compile *p,
		  struct brw_reg dest,
		  GLuint function,
		  GLuint saturate,
		  GLuint msg_reg_nr,
		  struct brw_reg src,
		  GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   if (intel->gen >= 6) {
      /* Gen6+: a single native MATH instruction covers all 16 channels. */
      insn = next_insn(p, BRW_OPCODE_MATH);

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
      return;
   }

   /* Pre-gen6: issue two SENDs, one per 8-channel half.
    *
    * First instruction:
    */
   brw_push_insn_state(p);
   brw_set_predicate_control_flag_value(p, 0xff);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   /* Second instruction: the upper half, one MRF and one GRF further on.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
   insn->header.destreg__conditionalmod = msg_reg_nr+1;

   brw_set_dest(p, insn, offset(dest,1));
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   brw_pop_insn_state(p);
}
1706
1707
1708 /**
1709 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1710 * using a constant offset per channel.
1711 *
1712 * The offset must be aligned to oword size (16 bytes). Used for
1713 * register spilling.
1714 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   /* Gen6+ takes the offset in owords rather than bytes. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (intel->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1816
1817
1818 /**
1819 * Read a block of owords (half a GRF each) from the scratch buffer
1820 * using a constant index per channel.
1821 *
1822 * Offset must be aligned to oword size (16 bytes). Used for register
1823 * spilling.
1824 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control;
   int rlen;

   /* Gen6+ takes the offset in owords rather than bytes. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in the MRF: g0 with g0.2 set to the offset. */
   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest);	/* UW? */
      if (intel->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
			      insn,
			      255, /* binding table index (255=stateless) */
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
			      rlen);
   }
}
1891
1892 /**
1893 * Read a float[4] vector from the data port Data Cache (const buffer).
1894 * Location (in buffer) should be a multiple of 16.
1895 * Used for fetching shader constants.
1896 */
void brw_oword_block_read(struct brw_compile *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;

   /* On newer hardware, offset is in units of owords. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   /* Message header: g0 with the global offset in element 2. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (intel->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
1949
1950 /**
1951 * Read a set of dwords from the data port Data Cache (const buffer).
1952 *
1953 * Location (in buffer) appears as UD offsets in the register after
1954 * the provided mrf header reg.
1955 */
1956 void brw_dword_scattered_read(struct brw_compile *p,
1957 struct brw_reg dest,
1958 struct brw_reg mrf,
1959 uint32_t bind_table_index)
1960 {
1961 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1962
1963 brw_push_insn_state(p);
1964 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1965 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1966 brw_set_mask_control(p, BRW_MASK_DISABLE);
1967 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1968 brw_pop_insn_state(p);
1969
1970 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1971 insn->header.destreg__conditionalmod = mrf.nr;
1972
1973 /* cast dest to a uword[8] vector */
1974 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1975
1976 brw_set_dest(p, insn, dest);
1977 brw_set_src0(p, insn, brw_null_reg());
1978
1979 brw_set_dp_read_message(p,
1980 insn,
1981 bind_table_index,
1982 BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1983 BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1984 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1985 2, /* msg_length */
1986 1); /* response_length */
1987 }
1988
1989
1990
1991 /**
1992 * Read float[4] constant(s) from VS constant buffer.
1993 * For relative addressing, two float[4] constants will be read into 'dest'.
1994 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1995 */
1996 void brw_dp_READ_4_vs(struct brw_compile *p,
1997 struct brw_reg dest,
1998 GLuint location,
1999 GLuint bind_table_index)
2000 {
2001 struct intel_context *intel = &p->brw->intel;
2002 struct brw_instruction *insn;
2003 GLuint msg_reg_nr = 1;
2004
2005 if (intel->gen >= 6)
2006 location /= 16;
2007
2008 /* Setup MRF[1] with location/offset into const buffer */
2009 brw_push_insn_state(p);
2010 brw_set_access_mode(p, BRW_ALIGN_1);
2011 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2012 brw_set_mask_control(p, BRW_MASK_DISABLE);
2013 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2014 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
2015 BRW_REGISTER_TYPE_UD),
2016 brw_imm_ud(location));
2017 brw_pop_insn_state(p);
2018
2019 insn = next_insn(p, BRW_OPCODE_SEND);
2020
2021 insn->header.predicate_control = BRW_PREDICATE_NONE;
2022 insn->header.compression_control = BRW_COMPRESSION_NONE;
2023 insn->header.destreg__conditionalmod = msg_reg_nr;
2024 insn->header.mask_control = BRW_MASK_DISABLE;
2025
2026 brw_set_dest(p, insn, dest);
2027 if (intel->gen >= 6) {
2028 brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
2029 } else {
2030 brw_set_src0(p, insn, brw_null_reg());
2031 }
2032
2033 brw_set_dp_read_message(p,
2034 insn,
2035 bind_table_index,
2036 0,
2037 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2038 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2039 1, /* msg_length */
2040 1); /* response_length (1 Oword) */
2041 }
2042
2043 /**
2044 * Read a float[4] constant per vertex from VS constant buffer, with
2045 * relative addressing.
2046 */
2047 void brw_dp_READ_4_vs_relative(struct brw_compile *p,
2048 struct brw_reg dest,
2049 struct brw_reg addr_reg,
2050 GLuint offset,
2051 GLuint bind_table_index)
2052 {
2053 struct intel_context *intel = &p->brw->intel;
2054 struct brw_reg src = brw_vec8_grf(0, 0);
2055 int msg_type;
2056
2057 /* Setup MRF[1] with offset into const buffer */
2058 brw_push_insn_state(p);
2059 brw_set_access_mode(p, BRW_ALIGN_1);
2060 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2061 brw_set_mask_control(p, BRW_MASK_DISABLE);
2062 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2063
2064 /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2065 * fields ignored.
2066 */
2067 brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2068 addr_reg, brw_imm_d(offset));
2069 brw_pop_insn_state(p);
2070
2071 gen6_resolve_implied_move(p, &src, 0);
2072 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2073
2074 insn->header.predicate_control = BRW_PREDICATE_NONE;
2075 insn->header.compression_control = BRW_COMPRESSION_NONE;
2076 insn->header.destreg__conditionalmod = 0;
2077 insn->header.mask_control = BRW_MASK_DISABLE;
2078
2079 brw_set_dest(p, insn, dest);
2080 brw_set_src0(p, insn, src);
2081
2082 if (intel->gen >= 6)
2083 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2084 else if (intel->gen == 5 || intel->is_g4x)
2085 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2086 else
2087 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2088
2089 brw_set_dp_read_message(p,
2090 insn,
2091 bind_table_index,
2092 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2093 msg_type,
2094 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2095 2, /* msg_length */
2096 1); /* response_length */
2097 }
2098
2099
2100
2101 void brw_fb_WRITE(struct brw_compile *p,
2102 int dispatch_width,
2103 GLuint msg_reg_nr,
2104 struct brw_reg src0,
2105 GLuint binding_table_index,
2106 GLuint msg_length,
2107 GLuint response_length,
2108 bool eot,
2109 bool header_present)
2110 {
2111 struct intel_context *intel = &p->brw->intel;
2112 struct brw_instruction *insn;
2113 GLuint msg_control, msg_type;
2114 struct brw_reg dest;
2115
2116 if (dispatch_width == 16)
2117 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2118 else
2119 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2120
2121 if (intel->gen >= 6 && binding_table_index == 0) {
2122 insn = next_insn(p, BRW_OPCODE_SENDC);
2123 } else {
2124 insn = next_insn(p, BRW_OPCODE_SEND);
2125 }
2126 /* The execution mask is ignored for render target writes. */
2127 insn->header.predicate_control = 0;
2128 insn->header.compression_control = BRW_COMPRESSION_NONE;
2129
2130 if (intel->gen >= 6) {
2131 /* headerless version, just submit color payload */
2132 src0 = brw_message_reg(msg_reg_nr);
2133
2134 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2135 } else {
2136 insn->header.destreg__conditionalmod = msg_reg_nr;
2137
2138 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2139 }
2140
2141 if (dispatch_width == 16)
2142 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2143 else
2144 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2145
2146 brw_set_dest(p, insn, dest);
2147 brw_set_src0(p, insn, src0);
2148 brw_set_dp_write_message(p,
2149 insn,
2150 binding_table_index,
2151 msg_control,
2152 msg_type,
2153 msg_length,
2154 header_present,
2155 1, /* last render target write */
2156 response_length,
2157 eot,
2158 0 /* send_commit_msg */);
2159 }
2160
2161
2162 /**
2163 * Texture sample instruction.
2164 * Note: the msg_type plus msg_length values determine exactly what kind
2165 * of sampling operation is performed. See volume 4, page 161 of docs.
2166 */
2167 void brw_SAMPLE(struct brw_compile *p,
2168 struct brw_reg dest,
2169 GLuint msg_reg_nr,
2170 struct brw_reg src0,
2171 GLuint binding_table_index,
2172 GLuint sampler,
2173 GLuint writemask,
2174 GLuint msg_type,
2175 GLuint response_length,
2176 GLuint msg_length,
2177 GLuint header_present,
2178 GLuint simd_mode,
2179 GLuint return_format)
2180 {
2181 struct intel_context *intel = &p->brw->intel;
2182 bool need_stall = 0;
2183
2184 if (writemask == 0) {
2185 /*printf("%s: zero writemask??\n", __FUNCTION__); */
2186 return;
2187 }
2188
2189 /* Hardware doesn't do destination dependency checking on send
2190 * instructions properly. Add a workaround which generates the
2191 * dependency by other means. In practice it seems like this bug
2192 * only crops up for texture samples, and only where registers are
2193 * written by the send and then written again later without being
2194 * read in between. Luckily for us, we already track that
2195 * information and use it to modify the writemask for the
2196 * instruction, so that is a guide for whether a workaround is
2197 * needed.
2198 */
2199 if (writemask != WRITEMASK_XYZW) {
2200 GLuint dst_offset = 0;
2201 GLuint i, newmask = 0, len = 0;
2202
2203 for (i = 0; i < 4; i++) {
2204 if (writemask & (1<<i))
2205 break;
2206 dst_offset += 2;
2207 }
2208 for (; i < 4; i++) {
2209 if (!(writemask & (1<<i)))
2210 break;
2211 newmask |= 1<<i;
2212 len++;
2213 }
2214
2215 if (newmask != writemask) {
2216 need_stall = 1;
2217 /* printf("need stall %x %x\n", newmask , writemask); */
2218 }
2219 else {
2220 bool dispatch_16 = false;
2221
2222 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2223
2224 guess_execution_size(p, p->current, dest);
2225 if (p->current->header.execution_size == BRW_EXECUTE_16)
2226 dispatch_16 = true;
2227
2228 newmask = ~newmask & WRITEMASK_XYZW;
2229
2230 brw_push_insn_state(p);
2231
2232 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2233 brw_set_mask_control(p, BRW_MASK_DISABLE);
2234
2235 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2236 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2237 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2238
2239 brw_pop_insn_state(p);
2240
2241 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2242 dest = offset(dest, dst_offset);
2243
2244 /* For 16-wide dispatch, masked channels are skipped in the
2245 * response. For 8-wide, masked channels still take up slots,
2246 * and are just not written to.
2247 */
2248 if (dispatch_16)
2249 response_length = len * 2;
2250 }
2251 }
2252
2253 {
2254 struct brw_instruction *insn;
2255
2256 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2257
2258 insn = next_insn(p, BRW_OPCODE_SEND);
2259 insn->header.predicate_control = 0; /* XXX */
2260 insn->header.compression_control = BRW_COMPRESSION_NONE;
2261 if (intel->gen < 6)
2262 insn->header.destreg__conditionalmod = msg_reg_nr;
2263
2264 brw_set_dest(p, insn, dest);
2265 brw_set_src0(p, insn, src0);
2266 brw_set_sampler_message(p, insn,
2267 binding_table_index,
2268 sampler,
2269 msg_type,
2270 response_length,
2271 msg_length,
2272 header_present,
2273 simd_mode,
2274 return_format);
2275 }
2276
2277 if (need_stall) {
2278 struct brw_reg reg = vec8(offset(dest, response_length-1));
2279
2280 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2281 */
2282 brw_push_insn_state(p);
2283 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2284 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2285 retype(reg, BRW_REGISTER_TYPE_UD));
2286 brw_pop_insn_state(p);
2287 }
2288
2289 }
2290
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   bool used,
		   GLuint msg_length,
		   GLuint response_length,
		   bool eot,
		   bool writes_complete,
		   GLuint offset,
		   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into header dword 5, starting from the r0.5 value.
       */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register number is encoded in the
    * destreg/conditionalmod field of the instruction.
    */
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       allocate,
		       used,
		       msg_length,
		       response_length,
		       eot,
		       writes_complete,
		       offset,
		       swizzle);
}
2346
2347 static int
2348 brw_find_next_block_end(struct brw_compile *p, int start)
2349 {
2350 int ip;
2351
2352 for (ip = start + 1; ip < p->nr_insn; ip++) {
2353 struct brw_instruction *insn = &p->store[ip];
2354
2355 switch (insn->header.opcode) {
2356 case BRW_OPCODE_ENDIF:
2357 case BRW_OPCODE_ELSE:
2358 case BRW_OPCODE_WHILE:
2359 return ip;
2360 }
2361 }
2362 assert(!"not reached");
2363 return start + 1;
2364 }
2365
2366 /* There is no DO instruction on gen6, so to find the end of the loop
2367 * we have to see if the loop is jumping back before our start
2368 * instruction.
2369 */
2370 static int
2371 brw_find_loop_end(struct brw_compile *p, int start)
2372 {
2373 struct intel_context *intel = &p->brw->intel;
2374 int ip;
2375 int br = 2;
2376
2377 for (ip = start + 1; ip < p->nr_insn; ip++) {
2378 struct brw_instruction *insn = &p->store[ip];
2379
2380 if (insn->header.opcode == BRW_OPCODE_WHILE) {
2381 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2382 : insn->bits3.break_cont.jip;
2383 if (ip + jip / br <= start)
2384 return ip;
2385 }
2386 }
2387 assert(!"not reached");
2388 return start + 1;
2389 }
2390
2391 /* After program generation, go back and update the UIP and JIP of
2392 * BREAK and CONT instructions to their correct locations.
2393 */
2394 void
2395 brw_set_uip_jip(struct brw_compile *p)
2396 {
2397 struct intel_context *intel = &p->brw->intel;
2398 int ip;
2399 int br = 2;
2400
2401 if (intel->gen < 6)
2402 return;
2403
2404 for (ip = 0; ip < p->nr_insn; ip++) {
2405 struct brw_instruction *insn = &p->store[ip];
2406
2407 switch (insn->header.opcode) {
2408 case BRW_OPCODE_BREAK:
2409 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2410 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2411 insn->bits3.break_cont.uip =
2412 br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
2413 break;
2414 case BRW_OPCODE_CONTINUE:
2415 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2416 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
2417
2418 assert(insn->bits3.break_cont.uip != 0);
2419 assert(insn->bits3.break_cont.jip != 0);
2420 break;
2421 }
2422 }
2423 }
2424
2425 void brw_ff_sync(struct brw_compile *p,
2426 struct brw_reg dest,
2427 GLuint msg_reg_nr,
2428 struct brw_reg src0,
2429 bool allocate,
2430 GLuint response_length,
2431 bool eot)
2432 {
2433 struct intel_context *intel = &p->brw->intel;
2434 struct brw_instruction *insn;
2435
2436 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2437
2438 insn = next_insn(p, BRW_OPCODE_SEND);
2439 brw_set_dest(p, insn, dest);
2440 brw_set_src0(p, insn, src0);
2441 brw_set_src1(p, insn, brw_imm_d(0));
2442
2443 if (intel->gen < 6)
2444 insn->header.destreg__conditionalmod = msg_reg_nr;
2445
2446 brw_set_ff_sync_message(p,
2447 insn,
2448 allocate,
2449 response_length,
2450 eot);
2451 }
2452
2453 /**
2454 * Emit the SEND instruction necessary to generate stream output data on Gen6
2455 * (for transform feedback).
2456 *
2457 * If send_commit_msg is true, this is the last piece of stream output data
2458 * from this thread, so send the data as a committed write. According to the
2459 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2460 *
2461 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2462 * writes are complete by sending the final write as a committed write."
2463 */
2464 void
2465 brw_svb_write(struct brw_compile *p,
2466 struct brw_reg dest,
2467 GLuint msg_reg_nr,
2468 struct brw_reg src0,
2469 GLuint binding_table_index,
2470 bool send_commit_msg)
2471 {
2472 struct brw_instruction *insn;
2473
2474 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2475
2476 insn = next_insn(p, BRW_OPCODE_SEND);
2477 brw_set_dest(p, insn, dest);
2478 brw_set_src0(p, insn, src0);
2479 brw_set_src1(p, insn, brw_imm_d(0));
2480 brw_set_dp_write_message(p, insn,
2481 binding_table_index,
2482 0, /* msg_control: ignored */
2483 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2484 1, /* msg_length */
2485 true, /* header_present */
2486 0, /* last_render_target: ignored */
2487 send_commit_msg, /* response_length */
2488 0, /* end_of_thread */
2489 send_commit_msg); /* send_commit_msg */
2490 }