src/intel/compiler/brw_fs_builder.h

   1 /* -*- c++ -*- */
   2 /*
   3  * Copyright © 2010-2015 Intel Corporation
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 #ifndef BRW_FS_BUILDER_H
  26 #define BRW_FS_BUILDER_H
  27
  28 #include "brw_ir_fs.h"
  29 #include "brw_shader.h"
  30
  31 namespace brw {
  32    /**
  33     * Toolbox to assemble an FS IR program out of individual instructions.
  34     *
  35     * This object is meant to have an interface consistent with
  36     * brw::vec4_builder.  They cannot be fully interchangeable because
  37     * brw::fs_builder generates scalar code while brw::vec4_builder generates
  38     * vector code.
  39     */
  40    class fs_builder {
  41    public:
      /**
       * Type used in this IR to represent a source of an instruction.
       * It is an alias of fs_reg, the same type used for destinations.
       */
      typedef fs_reg src_reg;

      /**
       * Type used in this IR to represent the destination of an instruction.
       * It is an alias of fs_reg, the same type used for sources.
       */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction (alias of fs_inst). */
      typedef fs_inst instruction;
  50
  51       /**
  52        * Construct an fs_builder that inserts instructions into \p shader.
  53        * \p dispatch_width gives the native execution width of the program.
  54        */
  55       fs_builder(backend_shader *shader,
  56                  unsigned dispatch_width) :
  57          shader(shader), block(NULL), cursor(NULL),
  58          _dispatch_width(dispatch_width),
  59          _group(0),
  60          force_writemask_all(false),
  61          annotation()
  62       {
  63       }
  64
  65       /**
  66        * Construct an fs_builder that inserts instructions into \p shader
  67        * before instruction \p inst in basic block \p block.  The default
  68        * execution controls and debug annotation are initialized from the
  69        * instruction passed as argument.
  70        */
  71       fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
  72          shader(shader), block(block), cursor(inst),
  73          _dispatch_width(inst->exec_size),
  74          _group(inst->group),
  75          force_writemask_all(inst->force_writemask_all)
  76       {
  77          annotation.str = inst->annotation;
  78          annotation.ir = inst->ir;
  79       }
  80
  81       /**
  82        * Construct an fs_builder that inserts instructions before \p cursor in
  83        * basic block \p block, inheriting other code generation parameters
  84        * from this.
  85        */
  86       fs_builder
  87       at(bblock_t *block, exec_node *cursor) const
  88       {
  89          fs_builder bld = *this;
  90          bld.block = block;
  91          bld.cursor = cursor;
  92          return bld;
  93       }
  94
  95       /**
  96        * Construct an fs_builder appending instructions at the end of the
  97        * instruction list of the shader, inheriting other code generation
  98        * parameters from this.
  99        */
 100       fs_builder
 101       at_end() const
 102       {
 103          return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
 104       }
 105
 106       /**
 107        * Construct a builder specifying the default SIMD width and group of
 108        * channel enable signals, inheriting other code generation parameters
 109        * from this.
 110        *
 111        * \p n gives the default SIMD width, \p i gives the slot group used for
 112        * predication and control flow masking in multiples of \p n channels.
 113        */
 114       fs_builder
 115       group(unsigned n, unsigned i) const
 116       {
 117          assert(force_writemask_all ||
 118                 (n <= dispatch_width() && i < dispatch_width() / n));
 119          fs_builder bld = *this;
 120          bld._dispatch_width = n;
 121          bld._group += i * n;
 122          return bld;
 123       }
 124
 125       /**
 126        * Alias for group() with width equal to eight.
 127        */
 128       fs_builder
 129       half(unsigned i) const
 130       {
 131          return group(8, i);
 132       }
 133
 134       /**
 135        * Construct a builder with per-channel control flow execution masking
 136        * disabled if \p b is true.  If control flow execution masking is
 137        * already disabled this has no effect.
 138        */
 139       fs_builder
 140       exec_all(bool b = true) const
 141       {
 142          fs_builder bld = *this;
 143          if (b)
 144             bld.force_writemask_all = true;
 145          return bld;
 146       }
 147
 148       /**
 149        * Construct a builder with the given debug annotation info.
 150        */
 151       fs_builder
 152       annotate(const char *str, const void *ir = NULL) const
 153       {
 154          fs_builder bld = *this;
 155          bld.annotation.str = str;
 156          bld.annotation.ir = ir;
 157          return bld;
 158       }
 159
 160       /**
 161        * Get the SIMD width in use.
 162        */
 163       unsigned
 164       dispatch_width() const
 165       {
 166          return _dispatch_width;
 167       }
 168
 169       /**
 170        * Get the channel group in use.
 171        */
 172       unsigned
 173       group() const
 174       {
 175          return _group;
 176       }
 177
 178       /**
 179        * Allocate a virtual register of natural vector size (one for this IR)
 180        * and SIMD width.  \p n gives the amount of space to allocate in
 181        * dispatch_width units (which is just enough space for one logical
 182        * component in this IR).
 183        */
 184       dst_reg
 185       vgrf(enum brw_reg_type type, unsigned n = 1) const
 186       {
 187          assert(dispatch_width() <= 32);
 188
 189          if (n > 0)
 190             return dst_reg(VGRF, shader->alloc.allocate(
 191                               DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
 192                                            REG_SIZE)),
 193                            type);
 194          else
 195             return retype(null_reg_ud(), type);
 196       }
 197
 198       /**
 199        * Create a null register of floating type.
 200        */
 201       dst_reg
 202       null_reg_f() const
 203       {
 204          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
 205       }
 206
 207       dst_reg
 208       null_reg_df() const
 209       {
 210          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
 211       }
 212
 213       /**
 214        * Create a null register of signed integer type.
 215        */
 216       dst_reg
 217       null_reg_d() const
 218       {
 219          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 220       }
 221
 222       /**
 223        * Create a null register of unsigned integer type.
 224        */
 225       dst_reg
 226       null_reg_ud() const
 227       {
 228          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
 229       }
 230
 231       /**
 232        * Get the mask of SIMD channels enabled by dispatch and not yet
 233        * disabled by discard.
 234        */
 235       src_reg
 236       sample_mask_reg() const
 237       {
 238          if (shader->stage != MESA_SHADER_FRAGMENT) {
 239             return brw_imm_d(0xffffffff);
 240          } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
 241             return brw_flag_reg(0, 1);
 242          } else {
 243             assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
 244             return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
 245                           BRW_REGISTER_TYPE_UD);
 246          }
 247       }
 248
 249       /**
 250        * Insert an instruction into the program.
 251        */
 252       instruction *
 253       emit(const instruction &inst) const
 254       {
 255          return emit(new(shader->mem_ctx) instruction(inst));
 256       }
 257
 258       /**
 259        * Create and insert a nullary control instruction into the program.
 260        */
 261       instruction *
 262       emit(enum opcode opcode) const
 263       {
 264          return emit(instruction(opcode, dispatch_width()));
 265       }
 266
 267       /**
 268        * Create and insert a nullary instruction into the program.
 269        */
 270       instruction *
 271       emit(enum opcode opcode, const dst_reg &dst) const
 272       {
 273          return emit(instruction(opcode, dispatch_width(), dst));
 274       }
 275
 276       /**
 277        * Create and insert a unary instruction into the program.
 278        */
 279       instruction *
 280       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
 281       {
 282          switch (opcode) {
 283          case SHADER_OPCODE_RCP:
 284          case SHADER_OPCODE_RSQ:
 285          case SHADER_OPCODE_SQRT:
 286          case SHADER_OPCODE_EXP2:
 287          case SHADER_OPCODE_LOG2:
 288          case SHADER_OPCODE_SIN:
 289          case SHADER_OPCODE_COS:
 290             return emit(instruction(opcode, dispatch_width(), dst,
 291                                     fix_math_operand(src0)));
 292
 293          default:
 294             return emit(instruction(opcode, dispatch_width(), dst, src0));
 295          }
 296       }
 297
 298       /**
 299        * Create and insert a binary instruction into the program.
 300        */
 301       instruction *
 302       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
 303            const src_reg &src1) const
 304       {
 305          switch (opcode) {
 306          case SHADER_OPCODE_POW:
 307          case SHADER_OPCODE_INT_QUOTIENT:
 308          case SHADER_OPCODE_INT_REMAINDER:
 309             return emit(instruction(opcode, dispatch_width(), dst,
 310                                     fix_math_operand(src0),
 311                                     fix_math_operand(src1)));
 312
 313          default:
 314             return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
 315
 316          }
 317       }
 318
 319       /**
 320        * Create and insert a ternary instruction into the program.
 321        */
 322       instruction *
 323       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
 324            const src_reg &src1, const src_reg &src2) const
 325       {
 326          switch (opcode) {
 327          case BRW_OPCODE_BFE:
 328          case BRW_OPCODE_BFI2:
 329          case BRW_OPCODE_MAD:
 330          case BRW_OPCODE_LRP:
 331             return emit(instruction(opcode, dispatch_width(), dst,
 332                                     fix_3src_operand(src0),
 333                                     fix_3src_operand(src1),
 334                                     fix_3src_operand(src2)));
 335
 336          default:
 337             return emit(instruction(opcode, dispatch_width(), dst,
 338                                     src0, src1, src2));
 339          }
 340       }
 341
 342       /**
 343        * Create and insert an instruction with a variable number of sources
 344        * into the program.
 345        */
 346       instruction *
 347       emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
 348            unsigned n) const
 349       {
 350          return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
 351       }
 352
 353       /**
 354        * Insert a preallocated instruction into the program.
 355        */
 356       instruction *
 357       emit(instruction *inst) const
 358       {
 359          assert(inst->exec_size <= 32);
 360          assert(inst->exec_size == dispatch_width() ||
 361                 force_writemask_all);
 362
 363          inst->group = _group;
 364          inst->force_writemask_all = force_writemask_all;
 365          inst->annotation = annotation.str;
 366          inst->ir = annotation.ir;
 367
 368          if (block)
 369             static_cast<instruction *>(cursor)->insert_before(block, inst);
 370          else
 371             cursor->insert_before(inst);
 372
 373          return inst;
 374       }
 375
 376       /**
 377        * Select \p src0 if the comparison of both sources with the given
 378        * conditional mod evaluates to true, otherwise select \p src1.
 379        *
 380        * Generally useful to get the minimum or maximum of two values.
 381        */
 382       instruction *
 383       emit_minmax(const dst_reg &dst, const src_reg &src0,
 384                   const src_reg &src1, brw_conditional_mod mod) const
 385       {
 386          assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
 387
 388          return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
 389                                      fix_unsigned_negate(src1)));
 390       }
 391
 392       /**
 393        * Copy any live channel from \p src to the first channel of the result.
 394        */
 395       src_reg
 396       emit_uniformize(const src_reg &src) const
 397       {
 398          /* FIXME: We use a vector chan_index and dst to allow constant and
 399           * copy propagration to move result all the way into the consuming
 400           * instruction (typically a surface index or sampler index for a
 401           * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
 402           * dispatch. Once we teach const/copy propagation about scalars we
 403           * should go back to scalar destinations here.
 404           */
 405          const fs_builder ubld = exec_all();
 406          const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
 407          const dst_reg dst = vgrf(src.type);
 408
 409          ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
 410          ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
 411
 412          return src_reg(component(dst, 0));
 413       }
 414
 415       void
 416       emit_scan(enum opcode opcode, const dst_reg &tmp,
 417                 unsigned cluster_size, brw_conditional_mod mod) const
 418       {
 419          assert(dispatch_width() >= 8);
 420
 421          /* The instruction splitting code isn't advanced enough to split
 422           * these so we need to handle that ourselves.
 423           */
 424          if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
 425             const unsigned half_width = dispatch_width() / 2;
 426             const fs_builder ubld = exec_all().group(half_width, 0);
 427             dst_reg left = tmp;
 428             dst_reg right = horiz_offset(tmp, half_width);
 429             ubld.emit_scan(opcode, left, cluster_size, mod);
 430             ubld.emit_scan(opcode, right, cluster_size, mod);
 431             if (cluster_size > half_width) {
 432                src_reg left_comp = component(left, half_width - 1);
 433                set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
 434             }
 435             return;
 436          }
 437
 438          if (cluster_size > 1) {
 439             const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
 440             dst_reg left = horiz_stride(tmp, 2);
 441             dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
 442
 443             /* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn":
 444              *
 445              *    "When source or destination datatype is 64b or operation is
 446              *    integer DWord multiply, regioning in Align1 must follow
 447              *    these rules:
 448              *
 449              *    [...]
 450              *
 451              *    3. Source and Destination offset must be the same, except
 452              *       the case of scalar source."
 453              *
 454              * In order to work around this, we create a temporary register
 455              * and shift left over to match right.  If we have a 64-bit type,
 456              * we have to use two integer MOVs instead of a 64-bit MOV.
 457              */
 458             if (need_matching_subreg_offset(opcode, tmp.type)) {
 459                dst_reg tmp2 = vgrf(tmp.type);
 460                dst_reg new_left = horiz_stride(horiz_offset(tmp2, 1), 2);
 461                if (type_sz(tmp.type) > 4) {
 462                   ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 0),
 463                            subscript(left, BRW_REGISTER_TYPE_D, 0));
 464                   ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 1),
 465                            subscript(left, BRW_REGISTER_TYPE_D, 1));
 466                } else {
 467                   ubld.MOV(new_left, left);
 468                }
 469                left = new_left;
 470             }
 471             set_condmod(mod, ubld.emit(opcode, right, left, right));
 472          }
 473
 474          if (cluster_size > 2) {
 475             if (type_sz(tmp.type) <= 4 &&
 476                 !need_matching_subreg_offset(opcode, tmp.type)) {
 477                const fs_builder ubld =
 478                   exec_all().group(dispatch_width() / 4, 0);
 479                src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);
 480
 481                dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
 482                set_condmod(mod, ubld.emit(opcode, right, left, right));
 483
 484                right = horiz_stride(horiz_offset(tmp, 3), 4);
 485                set_condmod(mod, ubld.emit(opcode, right, left, right));
 486             } else {
 487                /* For 64-bit types, we have to do things differently because
 488                 * the code above would land us with destination strides that
 489                 * the hardware can't handle.  Fortunately, we'll only be
 490                 * 8-wide in that case and it's the same number of
 491                 * instructions.
 492                 */
 493                const fs_builder ubld = exec_all().group(2, 0);
 494
 495                for (unsigned i = 0; i < dispatch_width(); i += 4) {
 496                   src_reg left = component(tmp, i + 1);
 497                   dst_reg right = horiz_offset(tmp, i + 2);
 498                   set_condmod(mod, ubld.emit(opcode, right, left, right));
 499                }
 500             }
 501          }
 502
 503          if (cluster_size > 4) {
 504             const fs_builder ubld = exec_all().group(4, 0);
 505             src_reg left = component(tmp, 3);
 506             dst_reg right = horiz_offset(tmp, 4);
 507             set_condmod(mod, ubld.emit(opcode, right, left, right));
 508
 509             if (dispatch_width() > 8) {
 510                left = component(tmp, 8 + 3);
 511                right = horiz_offset(tmp, 8 + 4);
 512                set_condmod(mod, ubld.emit(opcode, right, left, right));
 513             }
 514          }
 515
 516          if (cluster_size > 8 && dispatch_width() > 8) {
 517             const fs_builder ubld = exec_all().group(8, 0);
 518             src_reg left = component(tmp, 7);
 519             dst_reg right = horiz_offset(tmp, 8);
 520             set_condmod(mod, ubld.emit(opcode, right, left, right));
 521          }
 522       }
 523
 524       /**
 525        * Assorted arithmetic ops.
 526        * @{
 527        */
 528 #define ALU1(op)                                        \
 529       instruction *                                     \
 530       op(const dst_reg &dst, const src_reg &src0) const \
 531       {                                                 \
 532          return emit(BRW_OPCODE_##op, dst, src0);       \
 533       }
 534
 535 #define ALU2(op)                                                        \
 536       instruction *                                                     \
 537       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
 538       {                                                                 \
 539          return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
 540       }
 541
 542 #define ALU2_ACC(op)                                                    \
 543       instruction *                                                     \
 544       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
 545       {                                                                 \
 546          instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
 547          inst->writes_accumulator = true;                               \
 548          return inst;                                                   \
 549       }
 550
 551 #define ALU3(op)                                                        \
 552       instruction *                                                     \
 553       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
 554          const src_reg &src2) const                                     \
 555       {                                                                 \
 556          return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
 557       }
 558
 559       ALU2(ADD)
 560       ALU2_ACC(ADDC)
 561       ALU2(AND)
 562       ALU2(ASR)
 563       ALU2(AVG)
 564       ALU3(BFE)
 565       ALU2(BFI1)
 566       ALU3(BFI2)
 567       ALU1(BFREV)
 568       ALU1(CBIT)
 569       ALU2(CMPN)
 570       ALU1(DIM)
 571       ALU2(DP2)
 572       ALU2(DP3)
 573       ALU2(DP4)
 574       ALU2(DPH)
 575       ALU1(F16TO32)
 576       ALU1(F32TO16)
 577       ALU1(FBH)
 578       ALU1(FBL)
 579       ALU1(FRC)
 580       ALU2(LINE)
 581       ALU1(LZD)
 582       ALU2(MAC)
 583       ALU2_ACC(MACH)
 584       ALU3(MAD)
 585       ALU1(MOV)
 586       ALU2(MUL)
 587       ALU1(NOT)
 588       ALU2(OR)
 589       ALU2(PLN)
 590       ALU1(RNDD)
 591       ALU1(RNDE)
 592       ALU1(RNDU)
 593       ALU1(RNDZ)
 594       ALU2(SAD2)
 595       ALU2_ACC(SADA2)
 596       ALU2(SEL)
 597       ALU2(SHL)
 598       ALU2(SHR)
 599       ALU2_ACC(SUBB)
 600       ALU2(XOR)
 601
 602 #undef ALU3
 603 #undef ALU2_ACC
 604 #undef ALU2
 605 #undef ALU1
 606       /** @} */
 607
 608       /**
 609        * CMP: Sets the low bit of the destination channels with the result
 610        * of the comparison, while the upper bits are undefined, and updates
 611        * the flag register with the packed 16 bits of the result.
 612        */
 613       instruction *
 614       CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
 615           brw_conditional_mod condition) const
 616       {
 617          /* Take the instruction:
 618           *
 619           * CMP null<d> src0<f> src1<f>
 620           *
 621           * Original gen4 does type conversion to the destination type
 622           * before comparison, producing garbage results for floating
 623           * point comparisons.
 624           *
 625           * The destination type doesn't matter on newer generations,
 626           * so we set the type to match src0 so we can compact the
 627           * instruction.
 628           */
 629          return set_condmod(condition,
 630                             emit(BRW_OPCODE_CMP, retype(dst, src0.type),
 631                                  fix_unsigned_negate(src0),
 632                                  fix_unsigned_negate(src1)));
 633       }
 634
 635       /**
 636        * Gen4 predicated IF.
 637        */
 638       instruction *
 639       IF(brw_predicate predicate) const
 640       {
 641          return set_predicate(predicate, emit(BRW_OPCODE_IF));
 642       }
 643
 644       /**
 645        * CSEL: dst = src2 <op> 0.0f ? src0 : src1
 646        */
 647       instruction *
 648       CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
 649            const src_reg &src2, brw_conditional_mod condition) const
 650       {
 651          /* CSEL only operates on floats, so we can't do integer </<=/>=/>
 652           * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
 653           * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
 654           */
 655          assert(src2.type == BRW_REGISTER_TYPE_F);
 656
 657          return set_condmod(condition,
 658                             emit(BRW_OPCODE_CSEL,
 659                                  retype(dst, BRW_REGISTER_TYPE_F),
 660                                  retype(src0, BRW_REGISTER_TYPE_F),
 661                                  retype(src1, BRW_REGISTER_TYPE_F),
 662                                  src2));
 663       }
 664
 665       /**
 666        * Emit a linear interpolation instruction.
 667        */
 668       instruction *
 669       LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
 670           const src_reg &a) const
 671       {
 672          if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
 673             /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 674              * we need to reorder the operands.
 675              */
 676             return emit(BRW_OPCODE_LRP, dst, a, y, x);
 677
 678          } else {
 679             /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 680             const dst_reg y_times_a = vgrf(dst.type);
 681             const dst_reg one_minus_a = vgrf(dst.type);
 682             const dst_reg x_times_one_minus_a = vgrf(dst.type);
 683
 684             MUL(y_times_a, y, a);
 685             ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
 686             MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
 687             return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
 688          }
 689       }
 690
 691       /**
 692        * Collect a number of registers in a contiguous range of registers.
 693        */
 694       instruction *
 695       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
 696                    unsigned sources, unsigned header_size) const
 697       {
 698          instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
 699          inst->header_size = header_size;
 700          inst->size_written = header_size * REG_SIZE;
 701          for (unsigned i = header_size; i < sources; i++) {
 702             inst->size_written +=
 703                ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
 704                      REG_SIZE);
 705          }
 706
 707          return inst;
 708       }
 709
      /**
       * Shader that instructions are emitted into.  The builder also goes
       * through it to reach the memory context, the virtual register
       * allocator, the instruction list and the device info.
       */
      backend_shader *shader;
 711
 712    private:
 713       /**
 714        * Workaround for negation of UD registers.  See comment in
 715        * fs_generator::generate_code() for more details.
 716        */
 717       src_reg
 718       fix_unsigned_negate(const src_reg &src) const
 719       {
 720          if (src.type == BRW_REGISTER_TYPE_UD &&
 721              src.negate) {
 722             dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
 723             MOV(temp, src);
 724             return src_reg(temp);
 725          } else {
 726             return src;
 727          }
 728       }
 729
 730       /**
 731        * Workaround for source register modes not supported by the ternary
 732        * instruction encoding.
 733        */
 734       src_reg
 735       fix_3src_operand(const src_reg &src) const
 736       {
 737          if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
 738             return src;
 739          } else {
 740             dst_reg expanded = vgrf(src.type);
 741             MOV(expanded, src);
 742             return expanded;
 743          }
 744       }
 745
 746       /**
 747        * Workaround for source register modes not supported by the math
 748        * instruction.
 749        */
 750       src_reg
 751       fix_math_operand(const src_reg &src) const
 752       {
 753          /* Can't do hstride == 0 args on gen6 math, so expand it out. We
 754           * might be able to do better by doing execsize = 1 math and then
 755           * expanding that result out, but we would need to be careful with
 756           * masking.
 757           *
 758           * Gen6 hardware ignores source modifiers (negate and abs) on math
 759           * instructions, so we also move to a temp to set those up.
 760           *
 761           * Gen7 relaxes most of the above restrictions, but still can't use IMM
 762           * operands to math
 763           */
 764          if ((shader->devinfo->gen == 6 &&
 765               (src.file == IMM || src.file == UNIFORM ||
 766                src.abs || src.negate)) ||
 767              (shader->devinfo->gen == 7 && src.file == IMM)) {
 768             const dst_reg tmp = vgrf(src.type);
 769             MOV(tmp, src);
 770             return tmp;
 771          } else {
 772             return src;
 773          }
 774       }
 775
 776
 777       /* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn":
 778        *
 779        *    "When source or destination datatype is 64b or operation is
 780        *    integer DWord multiply, regioning in Align1 must follow
 781        *    these rules:
 782        *
 783        *    [...]
 784        *
 785        *    3. Source and Destination offset must be the same, except
 786        *       the case of scalar source."
 787        *
 788        * This helper just detects when we're in this case.
 789        */
 790       bool
 791       need_matching_subreg_offset(enum opcode opcode,
 792                                   enum brw_reg_type type) const
 793       {
 794          if (!shader->devinfo->is_cherryview &&
 795              !gen_device_info_is_9lp(shader->devinfo))
 796             return false;
 797
 798          if (type_sz(type) > 4)
 799             return true;
 800
 801          if (opcode == BRW_OPCODE_MUL &&
 802              !brw_reg_type_is_floating_point(type))
 803             return true;
 804
 805          return false;
 806       }
 807
      /**
       * Basic block holding the insertion point.  When NULL, emit() links
       * new instructions in front of the cursor without going through a
       * block.
       */
      bblock_t *block;

      /** List node that new instructions are inserted before. */
      exec_node *cursor;

      /** Default execution width given to instructions this builder creates. */
      unsigned _dispatch_width;

      /** Channel group copied into every instruction this builder inserts. */
      unsigned _group;

      /**
       * Copied into every inserted instruction; turned on by exec_all() to
       * disable per-channel control flow execution masking.
       */
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
 820    };
 821 }
 822
 823 #endif