i965/fs_builder: Use the dispatch width for setting exec sizes
[mesa.git] src/mesa/drivers/dri/i965/brw_fs_builder.h
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder. They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

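      /* Illustrative sketch (not part of the original header): a visitor
       * typically constructs one builder for the whole program and derives
       * modified copies from it rather than mutating it in place, roughly:
       *
       *    const fs_builder bld = fs_builder(this, dispatch_width).at_end();
       *    bld.annotate("clamp color")
       *       .MOV(dst, src);
       *
       * Here `this`, `dispatch_width`, `dst` and `src` stand in for whatever
       * backend_shader pointer, native SIMD width and registers the caller
       * actually has at hand.
       */
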
      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(n <= dispatch_width() &&
                i < dispatch_width() / n);
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

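      /* Illustrative sketch (not part of the original header): group() and
       * half() are how a wide builder is narrowed to act on a subset of the
       * channels. An operation unavailable in SIMD16 could, for instance, be
       * emitted as two SIMD8 halves roughly like:
       *
       *    for (unsigned i = 0; i < bld.dispatch_width() / 8; i++)
       *       bld.half(i).MOV(half(dst, i), half(src, i));
       *
       * where half(reg, i) stands for whatever mechanism the caller uses to
       * select the i-th SIMD8 slice of a register.
       */
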
      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         return dst_reg(GRF, shader->alloc.allocate(
                           DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                        REG_SIZE)),
                        type, dispatch_width());
      }

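      /* Illustrative sketch (not part of the original header): with a SIMD16
       * builder, a call such as
       *
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
       *
       * requests 2 components * 4 bytes * 16 channels = 128 bytes, which the
       * division by REG_SIZE above rounds up to four 32-byte GRFs.
       */
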
      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         const bool uses_kill =
            (shader->stage == MESA_SHADER_FRAGMENT &&
             ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
         return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
                 uses_kill ? brw_flag_reg(0, 1) :
                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dispatch_width(), dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);
         assert(_group == 0 || _group == 8);

         inst->force_sechalf = (_group == 8);
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      void
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         if (shader->devinfo->gen >= 6) {
            set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
         } else {
            CMP(null_reg_d(), src0, src1, mod);
            set_predicate(BRW_PREDICATE_NORMAL,
                          SEL(dst, src0, src1));
         }
      }

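      /* Illustrative sketch (not part of the original header): a typical use
       * is computing a maximum, e.g.
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);
       *
       * which becomes a conditional-mod SEL on Gen6+ and a CMP followed by a
       * predicated SEL on earlier hardware, as implemented above. `dst`, `a`
       * and `b` stand for registers the caller already has.
       */
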
      /**
       * Copy any live channel from \p src to the first channel of \p dst.
       */
      void
      emit_uniformize(const dst_reg &dst, const src_reg &src) const
      {
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
         ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
                   src, component(chan_index, 0));
      }

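      /* Illustrative sketch (not part of the original header): this is useful
       * when a per-channel value has to be handed to a unit that only accepts
       * a scalar, e.g. a dynamically indexed surface:
       *
       *    const dst_reg surface = bld.vgrf(BRW_REGISTER_TYPE_UD);
       *    bld.emit_uniformize(surface, per_channel_index);
       *
       * where per_channel_index stands for whatever divergent source the
       * caller has; only component 0 of `surface` ends up holding the value
       * broadcast from a live channel.
       */
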
      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0) const                 \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);                       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          *    CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), src_reg(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         assert(dst.width % 8 == 0);
         instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
                                              dispatch_width(), dst,
                                              src, sources));
         inst->header_size = header_size;

         for (unsigned i = 0; i < header_size; i++)
            assert(src[i].file != GRF ||
                   src[i].width * type_sz(src[i].type) == 32);
         inst->regs_written = header_size;

         for (unsigned i = header_size; i < sources; ++i)
            assert(src[i].file != GRF ||
                   src[i].width == dst.width);
         inst->regs_written += (sources - header_size) * (dispatch_width() / 8);

         return inst;
      }

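      /* Illustrative sketch (not part of the original header): per-channel
       * message sources might be packed into a contiguous payload roughly
       * like
       *
       *    const src_reg srcs[] = { coord_x, coord_y };
       *    const dst_reg payload =
       *       bld.vgrf(BRW_REGISTER_TYPE_F, ARRAY_SIZE(srcs));
       *    bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
       *
       * where coord_x and coord_y stand for registers the caller has already
       * computed and the final 0 means there is no message header.
       */
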
      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen < 6) {
            inst->base_mrf = 2;
            inst->mlen = inst->sources * dispatch_width() / 8;

            if (inst->sources > 1) {
               /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
                * "Message Payload":
                *
                * "Operand0[7]. For the INT DIV functions, this operand is the
                *  denominator."
                * ...
                * "Operand1[7]. For the INT DIV functions, this operand is the
                *  numerator."
                */
               const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
               const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
               const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

               inst->resize_sources(1);
               inst->src[0] = src0;

               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type,
                                          dispatch_width()), src1);
            }
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif