i965: Move the back-end compiler to src/intel/compiler
mesa.git: src/intel/compiler/brw_fs_builder.h
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
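      /* Usage sketch (an illustrative addition, not upstream code): most
       * optimization passes append at the end of the program,
       *
       *    const fs_builder bld = fs_builder(shader, 16).at_end();
       *
       * while lowering passes typically build at an existing instruction,
       * inheriting its execution controls:
       *
       *    const fs_builder ibld(shader, block, inst);
       */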

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }
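      /* Illustrative sketch (hypothetical calling code): a SIMD16 builder
       * can be narrowed to either SIMD8 half, e.g. to emit an instruction
       * that only executes for channels 8..15:
       *
       *    const fs_builder bld(shader, 16);
       *    bld.group(8, 1).MOV(dst, src);
       *
       * half(i) below is shorthand for group(8, i).
       */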

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
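      /* Worked example (added for illustration): at SIMD16,
       * vgrf(BRW_REGISTER_TYPE_F, 2) requests
       * DIV_ROUND_UP(2 * 4 * 16, REG_SIZE) = 4 GRFs with 32-byte registers,
       * i.e. two logical 32-bit components of sixteen channels each.
       */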

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         assert(shader->stage != MESA_SHADER_FRAGMENT ||
                group() + dispatch_width() <= 16);
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
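      /* Usage note (added sketch): with BRW_CONDITIONAL_GE the SEL keeps
       * src0 whenever src0 >= src1, so
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);   // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);    // dst = min(a, b)
       */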

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16- or 32-wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
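      /* Typical use (illustrative, not from the original header): uniformize
       * a possibly divergent surface index before a send, e.g.
       *
       *    const fs_reg surface = bld.emit_uniformize(nonuniform_surface);
       *
       * so downstream message setup can treat it as a single scalar value.
       */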

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                  \
      instruction *                                               \
      op(const dst_reg &dst, const src_reg &src0) const           \
      {                                                           \
         return emit(BRW_OPCODE_##op, dst, src0);                 \
      }

#define ALU2(op)                                                              \
      instruction *                                                           \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const  \
      {                                                                       \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                       \
      }

#define ALU2_ACC(op)                                                          \
      instruction *                                                           \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const  \
      {                                                                       \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);          \
         inst->writes_accumulator = true;                                     \
         return inst;                                                         \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
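      /* Example (added sketch): set flag register f0 where a < b, then
       * predicate a following MOV on the per-channel result:
       *
       *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_L);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(dst, src));
       */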

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
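      /* Both paths above compute the same affine blend,
       * dst = x * (1 - a) + y * a (the GLSL mix(x, y, a) semantics);
       * the gen6+ path only reorders operands to fit the hardware LRP.
       */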

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
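      /* Illustrative sketch (hypothetical names; payload sizing shown for a
       * SIMD8 builder, where each per-channel source occupies one GRF):
       * gather a one-GRF header followed by two coordinates into a
       * contiguous send payload:
       *
       *    const fs_reg srcs[] = { header, u, v };
       *    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
       *    bld.LOAD_PAYLOAD(payload, srcs, 3, 1);
       */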

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math instructions.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif