src/mesa/drivers/dri/i965/brw_fs_builder.h

   1 /* -*- c++ -*- */
   2 /*
   3  * Copyright © 2010-2015 Intel Corporation
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 #ifndef BRW_FS_BUILDER_H
  26 #define BRW_FS_BUILDER_H
  27
  28 #include "brw_ir_fs.h"
  29 #include "brw_shader.h"
  30 #include "brw_context.h"
  31
  32 namespace brw {
  33    /**
  34     * Toolbox to assemble an FS IR program out of individual instructions.
  35     *
  36     * This object is meant to have an interface consistent with
  37     * brw::vec4_builder.  They cannot be fully interchangeable because
  38     * brw::fs_builder generates scalar code while brw::vec4_builder generates
  39     * vector code.
  40     */
  41    class fs_builder {
  42    public:
      /**
       * Type used in this IR to represent a source of an instruction.
       * Alias of fs_reg, just like dst_reg.
       */
      typedef fs_reg src_reg;

      /**
       * Type used in this IR to represent the destination of an instruction.
       * Alias of fs_reg, just like src_reg.
       */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction (fs_inst). */
      typedef fs_inst instruction;
  51
  52       /**
  53        * Construct an fs_builder that inserts instructions into \p shader.
  54        * \p dispatch_width gives the native execution width of the program.
  55        */
  56       fs_builder(backend_shader *shader,
  57                  unsigned dispatch_width) :
  58          shader(shader), block(NULL), cursor(NULL),
  59          _dispatch_width(dispatch_width),
  60          _group(0),
  61          force_writemask_all(false),
  62          annotation()
  63       {
  64       }
  65
  66       /**
  67        * Construct an fs_builder that inserts instructions into \p shader
  68        * before instruction \p inst in basic block \p block.  The default
  69        * execution controls and debug annotation are initialized from the
  70        * instruction passed as argument.
  71        */
  72       fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
  73          shader(shader), block(block), cursor(inst),
  74          _dispatch_width(inst->exec_size),
  75          _group(inst->force_sechalf ? 8 : 0),
  76          force_writemask_all(inst->force_writemask_all)
  77       {
  78          annotation.str = inst->annotation;
  79          annotation.ir = inst->ir;
  80       }
  81
  82       /**
  83        * Construct an fs_builder that inserts instructions before \p cursor in
  84        * basic block \p block, inheriting other code generation parameters
  85        * from this.
  86        */
  87       fs_builder
  88       at(bblock_t *block, exec_node *cursor) const
  89       {
  90          fs_builder bld = *this;
  91          bld.block = block;
  92          bld.cursor = cursor;
  93          return bld;
  94       }
  95
  96       /**
  97        * Construct an fs_builder appending instructions at the end of the
  98        * instruction list of the shader, inheriting other code generation
  99        * parameters from this.
 100        */
 101       fs_builder
 102       at_end() const
 103       {
 104          return at(NULL, (exec_node *)&shader->instructions.tail);
 105       }
 106
 107       /**
 108        * Construct a builder specifying the default SIMD width and group of
 109        * channel enable signals, inheriting other code generation parameters
 110        * from this.
 111        *
 112        * \p n gives the default SIMD width, \p i gives the slot group used for
 113        * predication and control flow masking in multiples of \p n channels.
 114        */
 115       fs_builder
 116       group(unsigned n, unsigned i) const
 117       {
 118          assert(force_writemask_all ||
 119                 (n <= dispatch_width() && i < dispatch_width() / n));
 120          fs_builder bld = *this;
 121          bld._dispatch_width = n;
 122          bld._group += i * n;
 123          return bld;
 124       }
 125
 126       /**
 127        * Alias for group() with width equal to eight.
 128        */
 129       fs_builder
 130       half(unsigned i) const
 131       {
 132          return group(8, i);
 133       }
 134
 135       /**
 136        * Construct a builder with per-channel control flow execution masking
 137        * disabled if \p b is true.  If control flow execution masking is
 138        * already disabled this has no effect.
 139        */
 140       fs_builder
 141       exec_all(bool b = true) const
 142       {
 143          fs_builder bld = *this;
 144          if (b)
 145             bld.force_writemask_all = true;
 146          return bld;
 147       }
 148
 149       /**
 150        * Construct a builder with the given debug annotation info.
 151        */
 152       fs_builder
 153       annotate(const char *str, const void *ir = NULL) const
 154       {
 155          fs_builder bld = *this;
 156          bld.annotation.str = str;
 157          bld.annotation.ir = ir;
 158          return bld;
 159       }
 160
 161       /**
 162        * Get the SIMD width in use.
 163        */
 164       unsigned
 165       dispatch_width() const
 166       {
 167          return _dispatch_width;
 168       }
 169
 170       /**
 171        * Allocate a virtual register of natural vector size (one for this IR)
 172        * and SIMD width.  \p n gives the amount of space to allocate in
 173        * dispatch_width units (which is just enough space for one logical
 174        * component in this IR).
 175        */
 176       dst_reg
 177       vgrf(enum brw_reg_type type, unsigned n = 1) const
 178       {
 179          assert(dispatch_width() <= 32);
 180
 181          if (n > 0)
 182             return dst_reg(VGRF, shader->alloc.allocate(
 183                               DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
 184                                            REG_SIZE)),
 185                            type);
 186          else
 187             return retype(null_reg_ud(), type);
 188       }
 189
 190       /**
 191        * Create a null register of floating type.
 192        */
 193       dst_reg
 194       null_reg_f() const
 195       {
 196          return dst_reg(retype(brw_null_vec(dispatch_width()),
 197                                BRW_REGISTER_TYPE_F));
 198       }
 199
 200       /**
 201        * Create a null register of signed integer type.
 202        */
 203       dst_reg
 204       null_reg_d() const
 205       {
 206          return dst_reg(retype(brw_null_vec(dispatch_width()),
 207                                BRW_REGISTER_TYPE_D));
 208       }
 209
 210       /**
 211        * Create a null register of unsigned integer type.
 212        */
 213       dst_reg
 214       null_reg_ud() const
 215       {
 216          return dst_reg(retype(brw_null_vec(dispatch_width()),
 217                                BRW_REGISTER_TYPE_UD));
 218       }
 219
 220       /**
 221        * Get the mask of SIMD channels enabled by dispatch and not yet
 222        * disabled by discard.
 223        */
 224       src_reg
 225       sample_mask_reg() const
 226       {
 227          if (shader->stage != MESA_SHADER_FRAGMENT) {
 228             return brw_imm_d(0xffff);
 229          } else if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill) {
 230             return brw_flag_reg(0, 1);
 231          } else {
 232             return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
 233          }
 234       }
 235
 236       /**
 237        * Insert an instruction into the program.
 238        */
 239       instruction *
 240       emit(const instruction &inst) const
 241       {
 242          return emit(new(shader->mem_ctx) instruction(inst));
 243       }
 244
 245       /**
 246        * Create and insert a nullary control instruction into the program.
 247        */
 248       instruction *
 249       emit(enum opcode opcode) const
 250       {
 251          return emit(instruction(opcode, dispatch_width()));
 252       }
 253
 254       /**
 255        * Create and insert a nullary instruction into the program.
 256        */
 257       instruction *
 258       emit(enum opcode opcode, const dst_reg &dst) const
 259       {
 260          return emit(instruction(opcode, dispatch_width(), dst));
 261       }
 262
 263       /**
 264        * Create and insert a unary instruction into the program.
 265        */
 266       instruction *
 267       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
 268       {
 269          switch (opcode) {
 270          case SHADER_OPCODE_RCP:
 271          case SHADER_OPCODE_RSQ:
 272          case SHADER_OPCODE_SQRT:
 273          case SHADER_OPCODE_EXP2:
 274          case SHADER_OPCODE_LOG2:
 275          case SHADER_OPCODE_SIN:
 276          case SHADER_OPCODE_COS:
 277             return fix_math_instruction(
 278                emit(instruction(opcode, dispatch_width(), dst,
 279                                 fix_math_operand(src0))));
 280
 281          default:
 282             return emit(instruction(opcode, dispatch_width(), dst, src0));
 283          }
 284       }
 285
 286       /**
 287        * Create and insert a binary instruction into the program.
 288        */
 289       instruction *
 290       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
 291            const src_reg &src1) const
 292       {
 293          switch (opcode) {
 294          case SHADER_OPCODE_POW:
 295          case SHADER_OPCODE_INT_QUOTIENT:
 296          case SHADER_OPCODE_INT_REMAINDER:
 297             return fix_math_instruction(
 298                emit(instruction(opcode, dispatch_width(), dst,
 299                                 fix_math_operand(src0),
 300                                 fix_math_operand(src1))));
 301
 302          default:
 303             return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
 304
 305          }
 306       }
 307
 308       /**
 309        * Create and insert a ternary instruction into the program.
 310        */
 311       instruction *
 312       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
 313            const src_reg &src1, const src_reg &src2) const
 314       {
 315          switch (opcode) {
 316          case BRW_OPCODE_BFE:
 317          case BRW_OPCODE_BFI2:
 318          case BRW_OPCODE_MAD:
 319          case BRW_OPCODE_LRP:
 320             return emit(instruction(opcode, dispatch_width(), dst,
 321                                     fix_3src_operand(src0),
 322                                     fix_3src_operand(src1),
 323                                     fix_3src_operand(src2)));
 324
 325          default:
 326             return emit(instruction(opcode, dispatch_width(), dst,
 327                                     src0, src1, src2));
 328          }
 329       }
 330
 331       /**
 332        * Create and insert an instruction with a variable number of sources
 333        * into the program.
 334        */
 335       instruction *
 336       emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
 337            unsigned n) const
 338       {
 339          return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
 340       }
 341
 342       /**
 343        * Insert a preallocated instruction into the program.
 344        */
 345       instruction *
 346       emit(instruction *inst) const
 347       {
 348          assert(inst->exec_size <= 32);
 349          assert(inst->exec_size == dispatch_width() ||
 350                 force_writemask_all);
 351          assert(_group == 0 || _group == 8);
 352
 353          inst->force_sechalf = (_group == 8);
 354          inst->force_writemask_all = force_writemask_all;
 355          inst->annotation = annotation.str;
 356          inst->ir = annotation.ir;
 357
 358          if (block)
 359             static_cast<instruction *>(cursor)->insert_before(block, inst);
 360          else
 361             cursor->insert_before(inst);
 362
 363          return inst;
 364       }
 365
 366       /**
 367        * Select \p src0 if the comparison of both sources with the given
 368        * conditional mod evaluates to true, otherwise select \p src1.
 369        *
 370        * Generally useful to get the minimum or maximum of two values.
 371        */
 372       void
 373       emit_minmax(const dst_reg &dst, const src_reg &src0,
 374                   const src_reg &src1, brw_conditional_mod mod) const
 375       {
 376          assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
 377
 378          if (shader->devinfo->gen >= 6) {
 379             set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
 380                                  fix_unsigned_negate(src1)));
 381          } else {
 382             CMP(null_reg_d(), src0, src1, mod);
 383             set_predicate(BRW_PREDICATE_NORMAL,
 384                           SEL(dst, src0, src1));
 385          }
 386       }
 387
 388       /**
 389        * Copy any live channel from \p src to the first channel of the result.
 390        */
 391       src_reg
 392       emit_uniformize(const src_reg &src) const
 393       {
 394          /* FIXME: We use a vector chan_index and dst to allow constant and
 395           * copy propagration to move result all the way into the consuming
 396           * instruction (typically a surface index or sampler index for a
 397           * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
 398           * dispatch. Once we teach const/copy propagation about scalars we
 399           * should go back to scalar destinations here.
 400           */
 401          const fs_builder ubld = exec_all();
 402          const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
 403          const dst_reg dst = vgrf(src.type);
 404
 405          ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
 406          ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
 407
 408          return src_reg(component(dst, 0));
 409       }
 410
 411       /**
 412        * Assorted arithmetic ops.
 413        * @{
 414        */
 415 #define ALU1(op)                                        \
 416       instruction *                                     \
 417       op(const dst_reg &dst, const src_reg &src0) const \
 418       {                                                 \
 419          return emit(BRW_OPCODE_##op, dst, src0);       \
 420       }
 421
 422 #define ALU2(op)                                                        \
 423       instruction *                                                     \
 424       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
 425       {                                                                 \
 426          return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
 427       }
 428
 429 #define ALU2_ACC(op)                                                    \
 430       instruction *                                                     \
 431       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
 432       {                                                                 \
 433          instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
 434          inst->writes_accumulator = true;                               \
 435          return inst;                                                   \
 436       }
 437
 438 #define ALU3(op)                                                        \
 439       instruction *                                                     \
 440       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
 441          const src_reg &src2) const                                     \
 442       {                                                                 \
 443          return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
 444       }
 445
 446       ALU2(ADD)
 447       ALU2_ACC(ADDC)
 448       ALU2(AND)
 449       ALU2(ASR)
 450       ALU2(AVG)
 451       ALU3(BFE)
 452       ALU2(BFI1)
 453       ALU3(BFI2)
 454       ALU1(BFREV)
 455       ALU1(CBIT)
 456       ALU2(CMPN)
 457       ALU3(CSEL)
 458       ALU2(DP2)
 459       ALU2(DP3)
 460       ALU2(DP4)
 461       ALU2(DPH)
 462       ALU1(F16TO32)
 463       ALU1(F32TO16)
 464       ALU1(FBH)
 465       ALU1(FBL)
 466       ALU1(FRC)
 467       ALU2(LINE)
 468       ALU1(LZD)
 469       ALU2(MAC)
 470       ALU2_ACC(MACH)
 471       ALU3(MAD)
 472       ALU1(MOV)
 473       ALU2(MUL)
 474       ALU1(NOT)
 475       ALU2(OR)
 476       ALU2(PLN)
 477       ALU1(RNDD)
 478       ALU1(RNDE)
 479       ALU1(RNDU)
 480       ALU1(RNDZ)
 481       ALU2(SAD2)
 482       ALU2_ACC(SADA2)
 483       ALU2(SEL)
 484       ALU2(SHL)
 485       ALU2(SHR)
 486       ALU2_ACC(SUBB)
 487       ALU2(XOR)
 488
 489 #undef ALU3
 490 #undef ALU2_ACC
 491 #undef ALU2
 492 #undef ALU1
 493       /** @} */
 494
 495       /**
 496        * CMP: Sets the low bit of the destination channels with the result
 497        * of the comparison, while the upper bits are undefined, and updates
 498        * the flag register with the packed 16 bits of the result.
 499        */
 500       instruction *
 501       CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
 502           brw_conditional_mod condition) const
 503       {
 504          /* Take the instruction:
 505           *
 506           * CMP null<d> src0<f> src1<f>
 507           *
 508           * Original gen4 does type conversion to the destination type
 509           * before comparison, producing garbage results for floating
 510           * point comparisons.
 511           *
 512           * The destination type doesn't matter on newer generations,
 513           * so we set the type to match src0 so we can compact the
 514           * instruction.
 515           */
 516          return set_condmod(condition,
 517                             emit(BRW_OPCODE_CMP, retype(dst, src0.type),
 518                                  fix_unsigned_negate(src0),
 519                                  fix_unsigned_negate(src1)));
 520       }
 521
 522       /**
 523        * Gen4 predicated IF.
 524        */
 525       instruction *
 526       IF(brw_predicate predicate) const
 527       {
 528          return set_predicate(predicate, emit(BRW_OPCODE_IF));
 529       }
 530
 531       /**
 532        * Emit a linear interpolation instruction.
 533        */
 534       instruction *
 535       LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
 536           const src_reg &a) const
 537       {
 538          if (shader->devinfo->gen >= 6) {
 539             /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 540              * we need to reorder the operands.
 541              */
 542             return emit(BRW_OPCODE_LRP, dst, a, y, x);
 543
 544          } else {
 545             /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 546             const dst_reg y_times_a = vgrf(dst.type);
 547             const dst_reg one_minus_a = vgrf(dst.type);
 548             const dst_reg x_times_one_minus_a = vgrf(dst.type);
 549
 550             MUL(y_times_a, y, a);
 551             ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
 552             MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
 553             return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
 554          }
 555       }
 556
 557       /**
 558        * Collect a number of registers in a contiguous range of registers.
 559        */
 560       instruction *
 561       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
 562                    unsigned sources, unsigned header_size) const
 563       {
 564          instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
 565          inst->header_size = header_size;
 566          inst->regs_written = header_size +
 567                               (sources - header_size) * (dispatch_width() / 8);
 568
 569          return inst;
 570       }
 571
      /**
       * Shader the instructions are inserted into.  The builder also takes
       * the memory context, register allocator, device info and stage
       * information it needs from here.
       */
      backend_shader *shader;
 573
 574    private:
 575       /**
 576        * Workaround for negation of UD registers.  See comment in
 577        * fs_generator::generate_code() for more details.
 578        */
 579       src_reg
 580       fix_unsigned_negate(const src_reg &src) const
 581       {
 582          if (src.type == BRW_REGISTER_TYPE_UD &&
 583              src.negate) {
 584             dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
 585             MOV(temp, src);
 586             return src_reg(temp);
 587          } else {
 588             return src;
 589          }
 590       }
 591
 592       /**
 593        * Workaround for source register modes not supported by the ternary
 594        * instruction encoding.
 595        */
 596       src_reg
 597       fix_3src_operand(const src_reg &src) const
 598       {
 599          if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
 600             return src;
 601          } else {
 602             dst_reg expanded = vgrf(src.type);
 603             MOV(expanded, src);
 604             return expanded;
 605          }
 606       }
 607
 608       /**
 609        * Workaround for source register modes not supported by the math
 610        * instruction.
 611        */
 612       src_reg
 613       fix_math_operand(const src_reg &src) const
 614       {
 615          /* Can't do hstride == 0 args on gen6 math, so expand it out. We
 616           * might be able to do better by doing execsize = 1 math and then
 617           * expanding that result out, but we would need to be careful with
 618           * masking.
 619           *
 620           * Gen6 hardware ignores source modifiers (negate and abs) on math
 621           * instructions, so we also move to a temp to set those up.
 622           *
 623           * Gen7 relaxes most of the above restrictions, but still can't use IMM
 624           * operands to math
 625           */
 626          if ((shader->devinfo->gen == 6 &&
 627               (src.file == IMM || src.file == UNIFORM ||
 628                src.abs || src.negate)) ||
 629              (shader->devinfo->gen == 7 && src.file == IMM)) {
 630             const dst_reg tmp = vgrf(src.type);
 631             MOV(tmp, src);
 632             return tmp;
 633          } else {
 634             return src;
 635          }
 636       }
 637
 638       /**
 639        * Workaround other weirdness of the math instruction.
 640        */
 641       instruction *
 642       fix_math_instruction(instruction *inst) const
 643       {
 644          if (shader->devinfo->gen < 6) {
 645             inst->base_mrf = 2;
 646             inst->mlen = inst->sources * dispatch_width() / 8;
 647
 648             if (inst->sources > 1) {
 649                /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
 650                 * "Message Payload":
 651                 *
 652                 * "Operand0[7].  For the INT DIV functions, this operand is the
 653                 *  denominator."
 654                 *  ...
 655                 * "Operand1[7].  For the INT DIV functions, this operand is the
 656                 *  numerator."
 657                 */
 658                const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
 659                const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
 660                const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
 661
 662                inst->resize_sources(1);
 663                inst->src[0] = src0;
 664
 665                at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type),
 666                                    src1);
 667             }
 668          }
 669
 670          return inst;
 671       }
 672
      /**
       * Basic block instructions are inserted into, or NULL to insert
       * through the cursor alone (see emit() and at_end()).
       */
      bblock_t *block;
      /** List node new instructions are inserted in front of. */
      exec_node *cursor;

      /** Default SIMD width, as returned by dispatch_width(). */
      unsigned _dispatch_width;
      /**
       * Channel offset accumulated by group().  emit() only accepts 0 or 8
       * and translates 8 into force_sechalf.
       */
      unsigned _group;
      /**
       * Copied to force_writemask_all of every emitted instruction; set by
       * exec_all().
       */
      bool force_writemask_all;

      /**
       * Debug annotation info: \c str ends up in the annotation field and
       * \c ir in the ir field of every emitted instruction.
       */
      struct {
         const char *str;
         const void *ir;
      } annotation;
 685    };
 686 }
 687
 688 #endif