/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
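
      /*
       * Illustrative usage sketch (the register names below are
       * hypothetical, not part of this interface): a pass normally builds
       * at the shader's native dispatch width and appends at the end of the
       * instruction list, e.g.
       *
       *    const fs_builder bld = fs_builder(shader, 16).at_end();
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, src);
       *    bld.ADD(dst, tmp, brw_imm_f(1.0f));
       *
       * where shader is the backend_shader being compiled.
       */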

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
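
      /*
       * For example (illustrative), in a SIMD16 program
       *
       *    bld.group(8, 1)
       *
       * yields a SIMD8 builder whose instructions act on channels 8..15 and
       * use the corresponding execution mask bits, while
       *
       *    bld.exec_all().group(1, 0)
       *
       * is a common way to build a single-channel scalar computation that
       * ignores the execution mask.
       */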

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
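
      /*
       * For example (assuming the usual 32-byte GRF, i.e. REG_SIZE == 32):
       * in a SIMD16 builder, vgrf(BRW_REGISTER_TYPE_F, 2) allocates
       * DIV_ROUND_UP(2 * 4 * 16, 32) = 4 registers, enough for two logical
       * 32-bit components at the current dispatch width.
       */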

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(fix_byte_src(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1)));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(fix_byte_src(src1)),
                                    fix_3src_operand(fix_byte_src(src2))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1), fix_byte_src(src2)));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't use byte operands for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
                                     fix_unsigned_negate(fix_byte_src(src1))));
      }
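
      /*
       * Example (illustrative sketch, hypothetical register names): clamping
       * a value to an upper bound with the SEL-based min/max helper,
       *
       *    bld.emit_minmax(dst, value, brw_imm_f(1.0f), BRW_CONDITIONAL_L);
       *
       * which selects the smaller of the two operands into dst.
       */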

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in SIMD16 or SIMD32
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
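
      /*
       * Example (illustrative, hypothetical names): a surface or sampler
       * index used in a message descriptor has to be dynamically uniform, so
       * a possibly divergent index is typically uniformized first:
       *
       *    const src_reg surface = bld.emit_uniformize(nonuniform_surface);
       *
       * The result reads the value of an arbitrary live channel from its
       * first component.
       */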

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these, so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            src_reg left = component(tmp, i - 1);
            dst_reg right = horiz_offset(tmp, i);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > i * 2) {
               left = component(tmp, i * 3 - 1);
               right = horiz_offset(tmp, i * 3);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }

            if (dispatch_width() > i * 4) {
               left = component(tmp, i * 5 - 1);
               right = horiz_offset(tmp, i * 5);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               left = component(tmp, i * 7 - 1);
               right = horiz_offset(tmp, i * 7);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }
      }
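
      /*
       * Worked example (illustrative): with opcode == BRW_OPCODE_ADD,
       * cluster_size == 4 and a SIMD8 builder, if tmp initially holds the
       * per-channel values c0..c7, emit_scan() leaves the inclusive prefix
       * sums of each cluster of four in place:
       *
       *    { c0, c0+c1, c0+c1+c2, c0+c1+c2+c3,
       *      c4, c4+c5, c4+c5+c6, c4+c5+c6+c7 }
       *
       * The pairwise step above handles strides of 2, the cluster_size > 2
       * step handles groups of 4, and the trailing loop propagates partial
       * results across larger power-of-two groups.
       */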

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
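
      /*
       * Typical use (illustrative sketch, hypothetical registers): comparing
       * into the null register just to update the flag, which a following
       * instruction is then predicated on:
       *
       *    bld.CMP(bld.null_reg_d(), a, b, BRW_CONDITIONAL_GE);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
       */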

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
                                 fix_byte_src(src2)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
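
      /*
       * In either path, LRP(dst, x, y, a) computes x * (1 - a) + y * a,
       * i.e. a blend from x (a == 0) to y (a == 1).  For example
       * (illustrative), bld.LRP(dst, x, y, brw_imm_f(0.5f)) averages the
       * two inputs.
       */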

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
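
      /*
       * Worked example (illustrative, assuming the usual 32-byte GRF): a
       * LOAD_PAYLOAD built at SIMD16 from one header register plus two
       * 32-bit sources has size_written = 1 * REG_SIZE +
       * 2 * ALIGN(16 * 4 * dst.stride, REG_SIZE), i.e. 5 registers for a
       * packed destination with stride 1.
       */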

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }

      backend_shader *shader;

      /**
       * Byte-sized operands are not supported for src1 on Gen11+.
       */
      src_reg
      fix_byte_src(const src_reg &src) const
      {
         if (shader->devinfo->gen < 11 || type_sz(src.type) != 1)
            return src;

         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
                             BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
         MOV(temp, src);
         return src_reg(temp);
      }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            /* fallthrough */
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math instructions.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif