/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }
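
      /*
       * Illustrative sketch only (assumes a backend_shader "s" and an
       * fs_reg "src" defined elsewhere): a builder appending at the end
       * of the program could be used roughly as
       *
       *    const fs_builder bld = fs_builder(&s, 8).at_end();
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.ADD(tmp, src, src_reg(1.0f));
       */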

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->force_sechalf ? 8 : 0),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }
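
      /*
       * For example (illustrative only): an operation restricted to the
       * second group of eight channels of a SIMD16 program could be
       * emitted through a builder obtained as
       *
       *    const fs_builder hbld = bld.group(8, 1);
       *
       * where "bld" is a hypothetical SIMD16 builder.
       */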

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }
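
      /*
       * Sketch of typical use: a builder for code that must run regardless
       * of channel enables (e.g. message header setup) can be derived as
       *
       *    const fs_builder ubld = bld.exec_all();
       *
       * with "bld" being an ordinary (hypothetical) builder.
       */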

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(GRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
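
      /*
       * For instance (illustrative, assuming the usual 32-byte REG_SIZE):
       * at SIMD16 dispatch, vgrf(BRW_REGISTER_TYPE_F, 2) allocates
       * DIV_ROUND_UP(2 * 4 * 16, 32) = 4 hardware registers, i.e. two
       * logical components of sixteen floats each.
       */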

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         const bool uses_kill =
            (shader->stage == MESA_SHADER_FRAGMENT &&
             ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
         return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
                 uses_kill ? brw_flag_reg(0, 1) :
                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);
         assert(_group == 0 || _group == 8);

         inst->force_sechalf = (_group == 8);
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      void
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         if (shader->devinfo->gen >= 6) {
            set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
         } else {
            CMP(null_reg_d(), src0, src1, mod);
            set_predicate(BRW_PREDICATE_NORMAL,
                          SEL(dst, src0, src1));
         }
      }
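
      /*
       * E.g. (sketch), taking the per-channel maximum of two registers
       * "a" and "b" defined elsewhere:
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);
       *
       * BRW_CONDITIONAL_L would give the minimum instead.
       */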

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
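
      /*
       * Typical use (illustrative): making a possibly divergent surface
       * index consumable by a single send message, assuming "surf_index"
       * is an fs_reg defined elsewhere:
       *
       *    const src_reg usurf = bld.emit_uniformize(surf_index);
       */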

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                              \
      instruction *                                                          \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                      \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                      \
      }

#define ALU2_ACC(op)                                                          \
      instruction *                                                          \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                      \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);         \
         inst->writes_accumulator = true;                                    \
         return inst;                                                        \
      }

#define ALU3(op)                                                             \
      instruction *                                                         \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,      \
         const src_reg &src2) const                                         \
      {                                                                     \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);               \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
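
      /*
       * Each ALUn(op) line above expands to an inline helper named after
       * the opcode, so (sketch) "bld.MUL(dst, a, b)" and
       * "bld.MAD(dst, c, a, b)" are shorthand for emitting BRW_OPCODE_MUL
       * and BRW_OPCODE_MAD with the builder's default execution controls;
       * the ALU2_ACC variants additionally mark the instruction as
       * writing the accumulator.
       */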

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
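
      /*
       * For example (illustrative), setting the flag register to "a < b"
       * per channel without keeping the boolean result around:
       *
       *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_L);
       *
       * where "a" and "b" are float-typed registers defined elsewhere.
       */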

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), src_reg(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
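
      /*
       * E.g. (sketch), blending two colors by a factor "t" defined
       * elsewhere: bld.LRP(dst, color0, color1, t) computes
       * color0 * (1 - t) + color1 * t on every active channel,
       * regardless of which code path above is taken.
       */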

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->regs_written = header_size +
                              (sources - header_size) * (dispatch_width() / 8);

         return inst;
      }
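
      /*
       * Illustrative use: gathering a two-source message payload with no
       * header, where "payload" and "srcs" (an array of two fs_regs) are
       * hypothetical:
       *
       *    bld.LOAD_PAYLOAD(payload, srcs, 2, 0);
       *
       * At SIMD16 this marks regs_written as 0 + 2 * (16 / 8) = 4.
       */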

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen < 6) {
            inst->base_mrf = 2;
            inst->mlen = inst->sources * dispatch_width() / 8;

            if (inst->sources > 1) {
               /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
                * "Message Payload":
                *
                * "Operand0[7]. For the INT DIV functions, this operand is the
                *  denominator."
                * ...
                * "Operand1[7]. For the INT DIV functions, this operand is the
                *  numerator."
                */
               const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
               const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
               const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

               inst->resize_sources(1);
               inst->src[0] = src0;

               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
                                   src1);
            }
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif