/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
42 /** Type used in this IR to represent a source of an instruction. */
43 typedef fs_reg src_reg
;
45 /** Type used in this IR to represent the destination of an instruction. */
46 typedef fs_reg dst_reg
;
48 /** Type used in this IR to represent an instruction. */
49 typedef fs_inst instruction
;
52 * Construct an fs_builder that inserts instructions into \p shader.
53 * \p dispatch_width gives the native execution width of the program.
55 fs_builder(backend_shader
*shader
,
56 unsigned dispatch_width
) :
57 shader(shader
), block(NULL
), cursor(NULL
),
58 _dispatch_width(dispatch_width
),
60 force_writemask_all(false),
66 * Construct an fs_builder that inserts instructions into \p shader
67 * before instruction \p inst in basic block \p block. The default
68 * execution controls and debug annotation are initialized from the
69 * instruction passed as argument.
71 fs_builder(backend_shader
*shader
, bblock_t
*block
, fs_inst
*inst
) :
72 shader(shader
), block(block
), cursor(inst
),
73 _dispatch_width(inst
->exec_size
),
75 force_writemask_all(inst
->force_writemask_all
)
77 annotation
.str
= inst
->annotation
;
78 annotation
.ir
= inst
->ir
;
82 * Construct an fs_builder that inserts instructions before \p cursor in
83 * basic block \p block, inheriting other code generation parameters
87 at(bblock_t
*block
, exec_node
*cursor
) const
89 fs_builder bld
= *this;
96 * Construct an fs_builder appending instructions at the end of the
97 * instruction list of the shader, inheriting other code generation
98 * parameters from this.
103 return at(NULL
, (exec_node
*)&shader
->instructions
.tail_sentinel
);
107 * Construct a builder specifying the default SIMD width and group of
108 * channel enable signals, inheriting other code generation parameters
111 * \p n gives the default SIMD width, \p i gives the slot group used for
112 * predication and control flow masking in multiples of \p n channels.
115 group(unsigned n
, unsigned i
) const
117 fs_builder bld
= *this;
119 if (n
<= dispatch_width() && i
< dispatch_width() / n
) {
122 /* The requested channel group isn't a subset of the channel group
123 * of this builder, which means that the resulting instructions
124 * would use (potentially undefined) channel enable signals not
125 * specified by the parent builder. That's only valid if the
126 * instruction doesn't have per-channel semantics, in which case
127 * we should clear off the default group index in order to prevent
128 * emitting instructions with channel group not aligned to their
129 * own execution size.
131 assert(force_writemask_all
);
135 bld
._dispatch_width
= n
;
140 * Alias for group() with width equal to eight.
143 half(unsigned i
) const
149 * Construct a builder with per-channel control flow execution masking
150 * disabled if \p b is true. If control flow execution masking is
151 * already disabled this has no effect.
154 exec_all(bool b
= true) const
156 fs_builder bld
= *this;
158 bld
.force_writemask_all
= true;
163 * Construct a builder with the given debug annotation info.
166 annotate(const char *str
, const void *ir
= NULL
) const
168 fs_builder bld
= *this;
169 bld
.annotation
.str
= str
;
170 bld
.annotation
.ir
= ir
;
175 * Get the SIMD width in use.
178 dispatch_width() const
180 return _dispatch_width
;
184 * Get the channel group in use.
193 * Allocate a virtual register of natural vector size (one for this IR)
194 * and SIMD width. \p n gives the amount of space to allocate in
195 * dispatch_width units (which is just enough space for one logical
196 * component in this IR).
199 vgrf(enum brw_reg_type type
, unsigned n
= 1) const
201 assert(dispatch_width() <= 32);
204 return dst_reg(VGRF
, shader
->alloc
.allocate(
205 DIV_ROUND_UP(n
* type_sz(type
) * dispatch_width(),
209 return retype(null_reg_ud(), type
);
213 * Create a null register of floating type.
218 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F
));
224 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF
));
228 * Create a null register of signed integer type.
233 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D
));
237 * Create a null register of unsigned integer type.
242 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD
));
246 * Get the mask of SIMD channels enabled by dispatch and not yet
247 * disabled by discard.
250 sample_mask_reg() const
252 if (shader
->stage
!= MESA_SHADER_FRAGMENT
) {
253 return brw_imm_d(0xffffffff);
254 } else if (brw_wm_prog_data(shader
->stage_prog_data
)->uses_kill
) {
255 return brw_flag_reg(0, 1);
257 assert(shader
->devinfo
->gen
>= 6 && dispatch_width() <= 16);
258 return retype(brw_vec1_grf((_group
>= 16 ? 2 : 1), 7),
259 BRW_REGISTER_TYPE_UD
);
264 * Insert an instruction into the program.
267 emit(const instruction
&inst
) const
269 return emit(new(shader
->mem_ctx
) instruction(inst
));
273 * Create and insert a nullary control instruction into the program.
276 emit(enum opcode opcode
) const
278 return emit(instruction(opcode
, dispatch_width()));
282 * Create and insert a nullary instruction into the program.
285 emit(enum opcode opcode
, const dst_reg
&dst
) const
287 return emit(instruction(opcode
, dispatch_width(), dst
));
291 * Create and insert a unary instruction into the program.
294 emit(enum opcode opcode
, const dst_reg
&dst
, const src_reg
&src0
) const
297 case SHADER_OPCODE_RCP
:
298 case SHADER_OPCODE_RSQ
:
299 case SHADER_OPCODE_SQRT
:
300 case SHADER_OPCODE_EXP2
:
301 case SHADER_OPCODE_LOG2
:
302 case SHADER_OPCODE_SIN
:
303 case SHADER_OPCODE_COS
:
304 return emit(instruction(opcode
, dispatch_width(), dst
,
305 fix_math_operand(src0
)));
308 return emit(instruction(opcode
, dispatch_width(), dst
, src0
));
313 * Create and insert a binary instruction into the program.
316 emit(enum opcode opcode
, const dst_reg
&dst
, const src_reg
&src0
,
317 const src_reg
&src1
) const
320 case SHADER_OPCODE_POW
:
321 case SHADER_OPCODE_INT_QUOTIENT
:
322 case SHADER_OPCODE_INT_REMAINDER
:
323 return emit(instruction(opcode
, dispatch_width(), dst
,
324 fix_math_operand(src0
),
325 fix_math_operand(fix_byte_src(src1
))));
328 return emit(instruction(opcode
, dispatch_width(), dst
,
329 src0
, fix_byte_src(src1
)));
335 * Create and insert a ternary instruction into the program.
338 emit(enum opcode opcode
, const dst_reg
&dst
, const src_reg
&src0
,
339 const src_reg
&src1
, const src_reg
&src2
) const
343 case BRW_OPCODE_BFI2
:
346 return emit(instruction(opcode
, dispatch_width(), dst
,
347 fix_3src_operand(src0
),
348 fix_3src_operand(fix_byte_src(src1
)),
349 fix_3src_operand(fix_byte_src(src2
))));
352 return emit(instruction(opcode
, dispatch_width(), dst
,
353 src0
, fix_byte_src(src1
), fix_byte_src(src2
)));
358 * Create and insert an instruction with a variable number of sources
362 emit(enum opcode opcode
, const dst_reg
&dst
, const src_reg srcs
[],
365 return emit(instruction(opcode
, dispatch_width(), dst
, srcs
, n
));
369 * Insert a preallocated instruction into the program.
372 emit(instruction
*inst
) const
374 assert(inst
->exec_size
<= 32);
375 assert(inst
->exec_size
== dispatch_width() ||
376 force_writemask_all
);
378 inst
->group
= _group
;
379 inst
->force_writemask_all
= force_writemask_all
;
380 inst
->annotation
= annotation
.str
;
381 inst
->ir
= annotation
.ir
;
384 static_cast<instruction
*>(cursor
)->insert_before(block
, inst
);
386 cursor
->insert_before(inst
);
392 * Select \p src0 if the comparison of both sources with the given
393 * conditional mod evaluates to true, otherwise select \p src1.
395 * Generally useful to get the minimum or maximum of two values.
398 emit_minmax(const dst_reg
&dst
, const src_reg
&src0
,
399 const src_reg
&src1
, brw_conditional_mod mod
) const
401 assert(mod
== BRW_CONDITIONAL_GE
|| mod
== BRW_CONDITIONAL_L
);
403 /* In some cases we can't have bytes as operand for src1, so use the
404 * same type for both operand.
406 return set_condmod(mod
, SEL(dst
, fix_unsigned_negate(fix_byte_src(src0
)),
407 fix_unsigned_negate(fix_byte_src(src1
))));
411 * Copy any live channel from \p src to the first channel of the result.
414 emit_uniformize(const src_reg
&src
) const
416 /* FIXME: We use a vector chan_index and dst to allow constant and
417 * copy propagration to move result all the way into the consuming
418 * instruction (typically a surface index or sampler index for a
419 * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
420 * dispatch. Once we teach const/copy propagation about scalars we
421 * should go back to scalar destinations here.
423 const fs_builder ubld
= exec_all();
424 const dst_reg chan_index
= vgrf(BRW_REGISTER_TYPE_UD
);
425 const dst_reg dst
= vgrf(src
.type
);
427 ubld
.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL
, chan_index
)->flag_subreg
= 2;
428 ubld
.emit(SHADER_OPCODE_BROADCAST
, dst
, src
, component(chan_index
, 0));
430 return src_reg(component(dst
, 0));
434 move_to_vgrf(const src_reg
&src
, unsigned num_components
) const
436 src_reg
*const src_comps
= new src_reg
[num_components
];
437 for (unsigned i
= 0; i
< num_components
; i
++)
438 src_comps
[i
] = offset(src
, dispatch_width(), i
);
440 const dst_reg dst
= vgrf(src
.type
, num_components
);
441 LOAD_PAYLOAD(dst
, src_comps
, num_components
, 0);
449 emit_scan(enum opcode opcode
, const dst_reg
&tmp
,
450 unsigned cluster_size
, brw_conditional_mod mod
) const
452 assert(dispatch_width() >= 8);
454 /* The instruction splitting code isn't advanced enough to split
455 * these so we need to handle that ourselves.
457 if (dispatch_width() * type_sz(tmp
.type
) > 2 * REG_SIZE
) {
458 const unsigned half_width
= dispatch_width() / 2;
459 const fs_builder ubld
= exec_all().group(half_width
, 0);
461 dst_reg right
= horiz_offset(tmp
, half_width
);
462 ubld
.emit_scan(opcode
, left
, cluster_size
, mod
);
463 ubld
.emit_scan(opcode
, right
, cluster_size
, mod
);
464 if (cluster_size
> half_width
) {
465 src_reg left_comp
= component(left
, half_width
- 1);
466 set_condmod(mod
, ubld
.emit(opcode
, right
, left_comp
, right
));
471 if (cluster_size
> 1) {
472 const fs_builder ubld
= exec_all().group(dispatch_width() / 2, 0);
473 const dst_reg left
= horiz_stride(tmp
, 2);
474 const dst_reg right
= horiz_stride(horiz_offset(tmp
, 1), 2);
475 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
478 if (cluster_size
> 2) {
479 if (type_sz(tmp
.type
) <= 4) {
480 const fs_builder ubld
=
481 exec_all().group(dispatch_width() / 4, 0);
482 src_reg left
= horiz_stride(horiz_offset(tmp
, 1), 4);
484 dst_reg right
= horiz_stride(horiz_offset(tmp
, 2), 4);
485 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
487 right
= horiz_stride(horiz_offset(tmp
, 3), 4);
488 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
490 /* For 64-bit types, we have to do things differently because
491 * the code above would land us with destination strides that
492 * the hardware can't handle. Fortunately, we'll only be
493 * 8-wide in that case and it's the same number of
496 const fs_builder ubld
= exec_all().group(2, 0);
498 for (unsigned i
= 0; i
< dispatch_width(); i
+= 4) {
499 src_reg left
= component(tmp
, i
+ 1);
500 dst_reg right
= horiz_offset(tmp
, i
+ 2);
501 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
507 i
< MIN2(cluster_size
, dispatch_width());
509 const fs_builder ubld
= exec_all().group(i
, 0);
510 src_reg left
= component(tmp
, i
- 1);
511 dst_reg right
= horiz_offset(tmp
, i
);
512 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
514 if (dispatch_width() > i
* 2) {
515 left
= component(tmp
, i
* 3 - 1);
516 right
= horiz_offset(tmp
, i
* 3);
517 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
520 if (dispatch_width() > i
* 4) {
521 left
= component(tmp
, i
* 5 - 1);
522 right
= horiz_offset(tmp
, i
* 5);
523 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
525 left
= component(tmp
, i
* 7 - 1);
526 right
= horiz_offset(tmp
, i
* 7);
527 set_condmod(mod
, ubld
.emit(opcode
, right
, left
, right
));
/**
 * Assorted arithmetic ops.
 * @{
 */
/* NOTE(review): the #define lines for ALU1/ALU2/ALU3 were dropped by the
 * garbled extraction (only ALU2_ACC's survived); the macro skeletons are
 * reconstructed from their surviving bodies — confirm against upstream.
 * The list of macro invocations (ADD, MUL, MOV, SEL, ...) that followed
 * these definitions is missing from this chunk entirely.
 */
#define ALU1(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0) const                    \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0);                          \
   }

#define ALU2(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1);                    \
   }

#define ALU2_ACC(op)                                                    \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);       \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,     \
      const src_reg &src2) const                                        \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1, src2);              \
   }
619 * CMP: Sets the low bit of the destination channels with the result
620 * of the comparison, while the upper bits are undefined, and updates
621 * the flag register with the packed 16 bits of the result.
624 CMP(const dst_reg
&dst
, const src_reg
&src0
, const src_reg
&src1
,
625 brw_conditional_mod condition
) const
627 /* Take the instruction:
629 * CMP null<d> src0<f> src1<f>
631 * Original gen4 does type conversion to the destination type
632 * before comparison, producing garbage results for floating
635 * The destination type doesn't matter on newer generations,
636 * so we set the type to match src0 so we can compact the
639 return set_condmod(condition
,
640 emit(BRW_OPCODE_CMP
, retype(dst
, src0
.type
),
641 fix_unsigned_negate(src0
),
642 fix_unsigned_negate(src1
)));
646 * Gen4 predicated IF.
649 IF(brw_predicate predicate
) const
651 return set_predicate(predicate
, emit(BRW_OPCODE_IF
));
655 * CSEL: dst = src2 <op> 0.0f ? src0 : src1
658 CSEL(const dst_reg
&dst
, const src_reg
&src0
, const src_reg
&src1
,
659 const src_reg
&src2
, brw_conditional_mod condition
) const
661 /* CSEL only operates on floats, so we can't do integer </<=/>=/>
662 * comparisons. Zero/non-zero (== and !=) comparisons almost work.
663 * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
665 assert(src2
.type
== BRW_REGISTER_TYPE_F
);
667 return set_condmod(condition
,
668 emit(BRW_OPCODE_CSEL
,
669 retype(dst
, BRW_REGISTER_TYPE_F
),
670 retype(src0
, BRW_REGISTER_TYPE_F
),
671 retype(fix_byte_src(src1
), BRW_REGISTER_TYPE_F
),
672 fix_byte_src(src2
)));
676 * Emit a linear interpolation instruction.
679 LRP(const dst_reg
&dst
, const src_reg
&x
, const src_reg
&y
,
680 const src_reg
&a
) const
682 if (shader
->devinfo
->gen
>= 6 && shader
->devinfo
->gen
<= 10) {
683 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
684 * we need to reorder the operands.
686 return emit(BRW_OPCODE_LRP
, dst
, a
, y
, x
);
689 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
690 const dst_reg y_times_a
= vgrf(dst
.type
);
691 const dst_reg one_minus_a
= vgrf(dst
.type
);
692 const dst_reg x_times_one_minus_a
= vgrf(dst
.type
);
694 MUL(y_times_a
, y
, a
);
695 ADD(one_minus_a
, negate(a
), brw_imm_f(1.0f
));
696 MUL(x_times_one_minus_a
, x
, src_reg(one_minus_a
));
697 return ADD(dst
, src_reg(x_times_one_minus_a
), src_reg(y_times_a
));
702 * Collect a number of registers in a contiguous range of registers.
705 LOAD_PAYLOAD(const dst_reg
&dst
, const src_reg
*src
,
706 unsigned sources
, unsigned header_size
) const
708 instruction
*inst
= emit(SHADER_OPCODE_LOAD_PAYLOAD
, dst
, src
, sources
);
709 inst
->header_size
= header_size
;
710 inst
->size_written
= header_size
* REG_SIZE
;
711 for (unsigned i
= header_size
; i
< sources
; i
++) {
712 inst
->size_written
+=
713 ALIGN(dispatch_width() * type_sz(src
[i
].type
) * dst
.stride
,
721 UNDEF(const dst_reg
&dst
) const
723 assert(dst
.file
== VGRF
);
724 instruction
*inst
= emit(SHADER_OPCODE_UNDEF
,
725 retype(dst
, BRW_REGISTER_TYPE_UD
));
726 inst
->size_written
= shader
->alloc
.sizes
[dst
.nr
] * REG_SIZE
;
731 backend_shader
*shader
;
734 * Byte sized operands are not supported for src1 on Gen11+.
737 fix_byte_src(const src_reg
&src
) const
739 if (shader
->devinfo
->gen
< 11 || type_sz(src
.type
) != 1)
742 dst_reg temp
= vgrf(src
.type
== BRW_REGISTER_TYPE_UB
?
743 BRW_REGISTER_TYPE_UD
: BRW_REGISTER_TYPE_D
);
745 return src_reg(temp
);
750 * Workaround for negation of UD registers. See comment in
751 * fs_generator::generate_code() for more details.
754 fix_unsigned_negate(const src_reg
&src
) const
756 if (src
.type
== BRW_REGISTER_TYPE_UD
&&
758 dst_reg temp
= vgrf(BRW_REGISTER_TYPE_UD
);
760 return src_reg(temp
);
767 * Workaround for source register modes not supported by the ternary
768 * instruction encoding.
771 fix_3src_operand(const src_reg
&src
) const
775 /* FINISHME: Could handle scalar region, other stride=1 regions */
776 if (src
.vstride
!= BRW_VERTICAL_STRIDE_8
||
777 src
.width
!= BRW_WIDTH_8
||
778 src
.hstride
!= BRW_HORIZONTAL_STRIDE_1
)
790 dst_reg expanded
= vgrf(src
.type
);
796 * Workaround for source register modes not supported by the math
800 fix_math_operand(const src_reg
&src
) const
802 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
803 * might be able to do better by doing execsize = 1 math and then
804 * expanding that result out, but we would need to be careful with
807 * Gen6 hardware ignores source modifiers (negate and abs) on math
808 * instructions, so we also move to a temp to set those up.
810 * Gen7 relaxes most of the above restrictions, but still can't use IMM
813 if ((shader
->devinfo
->gen
== 6 &&
814 (src
.file
== IMM
|| src
.file
== UNIFORM
||
815 src
.abs
|| src
.negate
)) ||
816 (shader
->devinfo
->gen
== 7 && src
.file
== IMM
)) {
817 const dst_reg tmp
= vgrf(src
.type
);
/** Native execution width of the instructions emitted by default. */
unsigned _dispatch_width;

/* NOTE(review): the _group declaration and the annotation struct body
 * were dropped by the garbled extraction; both are reconstructed from
 * their uses (group(), emit(), annotate()) — confirm against upstream.
 */
/** Default channel group offset, in channels. */
unsigned _group;

/** Whether emitted instructions bypass per-channel execution masking. */
bool force_writemask_all;

/** Debug annotation info. */
struct {
   const char *str;
   const void *ir;
} annotation;