/* i965: Enable EGL_KHR_gl_texture_3D_image
 * [mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_builder.h
 */
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 #include "brw_context.h"
31
namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       *
       * With a NULL block and cursor, instructions are appended at the end
       * of the shader's instruction list (see emit(instruction *)).
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         /* A NULL block with the tail sentinel as cursor means "append to
          * the flat instruction list" in emit(instruction *).
          */
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         /* The group must fit within the parent's dispatch width unless
          * execution masking is disabled, in which case channel enables are
          * irrelevant.
          */
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       *
       * With \p n equal to zero no storage is allocated and a null register
       * of the requested type is returned instead.
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of double-float type.
       */
      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         assert(shader->stage != MESA_SHADER_FRAGMENT ||
                group() + dispatch_width() <= 16);
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            /* Non-FS stages have no discard; all channels are enabled. */
            return brw_imm_d(0xffffffff);
         } else if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill) {
            /* The discard state is tracked in flag register f0.1. */
            return brw_flag_reg(0, 1);
         } else {
            /* Without discard the dispatch mask delivered in g1.7 is
             * authoritative.
             */
            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       *
       * Math opcodes get their operand legalized for the hardware math
       * unit's source restrictions (see fix_math_operand()).
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       *
       * Math opcodes get both operands legalized for the hardware math
       * unit's source restrictions (see fix_math_operand()).
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       *
       * Three-source opcodes get their operands legalized for the more
       * restrictive 3-src encoding (see fix_3src_operand()).
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       *
       * All other emit() overloads funnel through here.  The builder's
       * current execution controls and debug annotation are stamped onto
       * the instruction before insertion.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         /* With a known basic block insert via the CFG so block bookkeeping
          * stays consistent, otherwise splice into the flat exec_list.
          */
         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagration to move result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
      /* Helper generating a builder method for a unary ALU opcode. */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

      /* Helper generating a builder method for a binary ALU opcode. */
#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

      /* Like ALU2, but for opcodes that implicitly write the accumulator. */
#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

      /* Helper generating a builder method for a ternary ALU opcode. */
#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       *
       * The first \p header_size sources are treated as a message header
       * occupying one physical register each; the remaining sources each
       * occupy one logical (dispatch_width-sized) component.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      /** Shader the new instructions are inserted into. */
      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            /* Copy unsupported sources into a temporary VGRF. */
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /** Basic block the cursor belongs to, or NULL for the flat list. */
      bblock_t *block;
      /** New instructions are inserted immediately before this node. */
      exec_node *cursor;

      /** Default SIMD width for new instructions. */
      unsigned _dispatch_width;
      /** Default channel group offset for new instructions. */
      unsigned _group;
      /** Whether per-channel execution masking is disabled by default. */
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}
662
663 #endif