src/mesa/drivers/dri/i965/brw_fs_builder.h

   1 /* -*- c++ -*- */
   2 /*
   3  * Copyright © 2010-2015 Intel Corporation
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 #ifndef BRW_FS_BUILDER_H
  26 #define BRW_FS_BUILDER_H
  27
  28 #include "brw_ir_fs.h"
  29 #include "brw_shader.h"
  30 #include "brw_context.h"
  31
  32 namespace brw {
  33    /**
  34     * Toolbox to assemble an FS IR program out of individual instructions.
  35     *
  36     * This object is meant to have an interface consistent with
  37     * brw::vec4_builder.  They cannot be fully interchangeable because
  38     * brw::fs_builder generates scalar code while brw::vec4_builder generates
  39     * vector code.
  40     */
  41    class fs_builder {
  42    public:
  43       /** Type representing an instruction source in this IR (alias of fs_reg). */
  44       typedef fs_reg src_reg;
  45
  46       /** Type representing an instruction destination in this IR (alias of fs_reg). */
  47       typedef fs_reg dst_reg;
  48
  49       /** Type representing an instruction in this IR (alias of fs_inst). */
  50       typedef fs_inst instruction;
  51
  52       /**
  53        * Construct an fs_builder that inserts instructions into \p shader.
  54        * \p dispatch_width gives the native execution width of the program.
  55        */
  56       fs_builder(backend_shader *shader,
  57                  unsigned dispatch_width) :
  58          shader(shader), block(NULL), cursor(NULL),
  59          _dispatch_width(dispatch_width),
  60          _group(0),
  61          force_writemask_all(false),
  62          annotation()
  63       {
  64       }
  65
  66       /**
  67        * Construct an fs_builder that inserts instructions before \p cursor in
  68        * basic block \p block, inheriting other code generation parameters
  69        * from this.
  70        */
  71       fs_builder
  72       at(bblock_t *block, exec_node *cursor) const
  73       {
  74          fs_builder bld = *this;
  75          bld.block = block;
  76          bld.cursor = cursor;
  77          return bld;
  78       }
  79
  80       /**
  81        * Construct an fs_builder appending instructions at the end of the
  82        * instruction list of the shader, inheriting other code generation
  83        * parameters from this.
  84        */
  85       fs_builder
  86       at_end() const
  87       {
  88          return at(NULL, (exec_node *)&shader->instructions.tail);
  89       }
  90
  91       /**
  92        * Construct a builder specifying the default SIMD width and group of
  93        * channel enable signals, inheriting other code generation parameters
  94        * from this.
  95        *
  96        * \p n gives the default SIMD width, \p i gives the slot group used for
  97        * predication and control flow masking in multiples of \p n channels.
  98        */
  99       fs_builder
 100       group(unsigned n, unsigned i) const
 101       {
 102          assert(n <= dispatch_width() &&
 103                 i < dispatch_width() / n);
 104          fs_builder bld = *this;
 105          bld._dispatch_width = n;
 106          bld._group += i * n;
 107          return bld;
 108       }
 109
 110       /**
 111        * Alias for group() with width equal to eight.
 112        */
 113       fs_builder
 114       half(unsigned i) const
 115       {
 116          return group(8, i);
 117       }
 118
 119       /**
 120        * Construct a builder with per-channel control flow execution masking
 121        * disabled if \p b is true.  If control flow execution masking is
 122        * already disabled this has no effect.
 123        */
 124       fs_builder
 125       exec_all(bool b = true) const
 126       {
 127          fs_builder bld = *this;
 128          if (b)
 129             bld.force_writemask_all = true;
 130          return bld;
 131       }
 132
 133       /**
 134        * Construct a builder with the given debug annotation info.
 135        */
 136       fs_builder
 137       annotate(const char *str, const void *ir = NULL) const
 138       {
 139          fs_builder bld = *this;
 140          bld.annotation.str = str;
 141          bld.annotation.ir = ir;
 142          return bld;
 143       }
 144
 145       /**
 146        * Get the SIMD width in use.
 147        */
 148       unsigned
 149       dispatch_width() const
 150       {
 151          return _dispatch_width;
 152       }
 153
 154       /**
 155        * Allocate a virtual register of natural vector size (one for this IR)
 156        * and SIMD width.  \p n gives the amount of space to allocate in
 157        * dispatch_width units (which is just enough space for one logical
 158        * component in this IR).
 159        */
 160       dst_reg
 161       vgrf(enum brw_reg_type type, unsigned n = 1) const
 162       {
 163          return dst_reg(GRF, shader->alloc.allocate(
 164                            DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
 165                                         REG_SIZE)),
 166                         type, dispatch_width());
 167       }
 168
 169       /**
 170        * Create a null register of floating type.
 171        */
 172       dst_reg
 173       null_reg_f() const
 174       {
 175          return dst_reg(retype(brw_null_vec(dispatch_width()),
 176                                BRW_REGISTER_TYPE_F));
 177       }
 178
 179       /**
 180        * Create a null register of signed integer type.
 181        */
 182       dst_reg
 183       null_reg_d() const
 184       {
 185          return dst_reg(retype(brw_null_vec(dispatch_width()),
 186                                BRW_REGISTER_TYPE_D));
 187       }
 188
 189       /**
 190        * Create a null register of unsigned integer type.
 191        */
 192       dst_reg
 193       null_reg_ud() const
 194       {
 195          return dst_reg(retype(brw_null_vec(dispatch_width()),
 196                                BRW_REGISTER_TYPE_UD));
 197       }
 198
 199       /**
 200        * Get the mask of SIMD channels enabled by dispatch and not yet
 201        * disabled by discard.
 202        */
 203       src_reg
 204       sample_mask_reg() const
 205       {
 206          const bool uses_kill =
 207             (shader->stage == MESA_SHADER_FRAGMENT &&
 208              ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
 209          return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
 210                  uses_kill ? brw_flag_reg(0, 1) :
 211                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
 212       }
 213
 214       /**
 215        * Insert an instruction into the program.
 216        */
 217       instruction *
 218       emit(const instruction &inst) const
 219       {
 220          return emit(new(shader->mem_ctx) instruction(inst));
 221       }
 222
 223       /**
 224        * Create and insert a nullary control instruction into the program.
 225        */
 226       instruction *
 227       emit(enum opcode opcode) const
 228       {
 229          return emit(instruction(opcode, dispatch_width()));
 230       }
 231
 232       /**
 233        * Create and insert a nullary instruction into the program.
 234        */
 235       instruction *
 236       emit(enum opcode opcode, const dst_reg &dst) const
 237       {
 238          return emit(instruction(opcode, dst));
 239       }
 240
 241       /**
 242        * Create and insert a unary instruction into the program.
 243        */
 244       instruction *
 245       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
 246       {
 247          switch (opcode) {
 248          case SHADER_OPCODE_RCP:
 249          case SHADER_OPCODE_RSQ:
 250          case SHADER_OPCODE_SQRT:
 251          case SHADER_OPCODE_EXP2:
 252          case SHADER_OPCODE_LOG2:
 253          case SHADER_OPCODE_SIN:
 254          case SHADER_OPCODE_COS:
 255             return fix_math_instruction(
 256                emit(instruction(opcode, dst.width, dst,
 257                                 fix_math_operand(src0))));
 258
 259          default:
 260             return emit(instruction(opcode, dst.width, dst, src0));
 261          }
 262       }
 263
 264       /**
 265        * Create and insert a binary instruction into the program.
 266        */
 267       instruction *
 268       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
 269            const src_reg &src1) const
 270       {
 271          switch (opcode) {
 272          case SHADER_OPCODE_POW:
 273          case SHADER_OPCODE_INT_QUOTIENT:
 274          case SHADER_OPCODE_INT_REMAINDER:
 275             return fix_math_instruction(
 276                emit(instruction(opcode, dst.width, dst,
 277                                 fix_math_operand(src0),
 278                                 fix_math_operand(src1))));
 279
 280          default:
 281             return emit(instruction(opcode, dst.width, dst, src0, src1));
 282
 283          }
 284       }
 285
 286       /**
 287        * Create and insert a ternary instruction into the program.
 288        */
 289       instruction *
 290       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
 291            const src_reg &src1, const src_reg &src2) const
 292       {
 293          switch (opcode) {
 294          case BRW_OPCODE_BFE:
 295          case BRW_OPCODE_BFI2:
 296          case BRW_OPCODE_MAD:
 297          case BRW_OPCODE_LRP:
 298             return emit(instruction(opcode, dst.width, dst,
 299                                     fix_3src_operand(src0),
 300                                     fix_3src_operand(src1),
 301                                     fix_3src_operand(src2)));
 302
 303          default:
 304             return emit(instruction(opcode, dst.width, dst, src0, src1, src2));
 305          }
 306       }
 307
 308       /**
 309        * Insert a preallocated instruction into the program.
 310        */
 311       instruction *
 312       emit(instruction *inst) const
 313       {
 314          assert(inst->exec_size == dispatch_width() ||
 315                 force_writemask_all);
 316          assert(_group == 0 || _group == 8);
 317
 318          inst->force_sechalf = (_group == 8);
 319          inst->force_writemask_all = force_writemask_all;
 320          inst->annotation = annotation.str;
 321          inst->ir = annotation.ir;
 322
 323          if (block)
 324             static_cast<instruction *>(cursor)->insert_before(block, inst);
 325          else
 326             cursor->insert_before(inst);
 327
 328          return inst;
 329       }
 330
 331       /**
 332        * Select \p src0 if the comparison of both sources with the given
 333        * conditional mod evaluates to true, otherwise select \p src1.
 334        *
 335        * Generally useful to get the minimum or maximum of two values.
 336        */
 337       void
 338       emit_minmax(const dst_reg &dst, const src_reg &src0,
 339                   const src_reg &src1, brw_conditional_mod mod) const
 340       {
 341          if (shader->devinfo->gen >= 6) {
 342             set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
 343                                  fix_unsigned_negate(src1)));
 344          } else {
 345             CMP(null_reg_d(), src0, src1, mod);
 346             set_predicate(BRW_PREDICATE_NORMAL,
 347                           SEL(dst, src0, src1));
 348          }
 349       }
 350
 351       /**
 352        * Copy any live channel from \p src to the first channel of \p dst.
 353        */
 354       void
 355       emit_uniformize(const dst_reg &dst, const src_reg &src) const
 356       {
 357          const fs_builder ubld = exec_all();
 358          const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
 359
 360          ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
 361          ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
 362                    src, component(chan_index, 0));
 363       }
 364
 365       /**
 366        * Assorted arithmetic ops.
 367        * @{
 368        */
 369 #define ALU1(op)                                        \
 370       instruction *                                     \
 371       op(const dst_reg &dst, const src_reg &src0) const \
 372       {                                                 \
 373          return emit(BRW_OPCODE_##op, dst, src0);       \
 374       }
 375
 376 #define ALU2(op)                                                        \
 377       instruction *                                                     \
 378       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
 379       {                                                                 \
 380          return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
 381       }
 382
 383 #define ALU2_ACC(op)                                                    \
 384       instruction *                                                     \
 385       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
 386       {                                                                 \
 387          instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
 388          inst->writes_accumulator = true;                               \
 389          return inst;                                                   \
 390       }
 391
 392 #define ALU3(op)                                                        \
 393       instruction *                                                     \
 394       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
 395          const src_reg &src2) const                                     \
 396       {                                                                 \
 397          return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
 398       }
 399
 400       ALU2(ADD)
 401       ALU2_ACC(ADDC)
 402       ALU2(AND)
 403       ALU2(ASR)
 404       ALU2(AVG)
 405       ALU3(BFE)
 406       ALU2(BFI1)
 407       ALU3(BFI2)
 408       ALU1(BFREV)
 409       ALU1(CBIT)
 410       ALU2(CMPN)
 411       ALU3(CSEL)
 412       ALU2(DP2)
 413       ALU2(DP3)
 414       ALU2(DP4)
 415       ALU2(DPH)
 416       ALU1(F16TO32)
 417       ALU1(F32TO16)
 418       ALU1(FBH)
 419       ALU1(FBL)
 420       ALU1(FRC)
 421       ALU2(LINE)
 422       ALU1(LZD)
 423       ALU2(MAC)
 424       ALU2_ACC(MACH)
 425       ALU3(MAD)
 426       ALU1(MOV)
 427       ALU2(MUL)
 428       ALU1(NOT)
 429       ALU2(OR)
 430       ALU2(PLN)
 431       ALU1(RNDD)
 432       ALU1(RNDE)
 433       ALU1(RNDU)
 434       ALU1(RNDZ)
 435       ALU2(SAD2)
 436       ALU2_ACC(SADA2)
 437       ALU2(SEL)
 438       ALU2(SHL)
 439       ALU2(SHR)
 440       ALU2_ACC(SUBB)
 441       ALU2(XOR)
 442
 443 #undef ALU3
 444 #undef ALU2_ACC
 445 #undef ALU2
 446 #undef ALU1
 447       /** @} */
 448
 449       /**
 450        * CMP: Sets the low bit of the destination channels with the result
 451        * of the comparison, while the upper bits are undefined, and updates
 452        * the flag register with the packed 16 bits of the result.
 453        */
 454       instruction *
 455       CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
 456           brw_conditional_mod condition) const
 457       {
 458          /* Take the instruction:
 459           *
 460           * CMP null<d> src0<f> src1<f>
 461           *
 462           * Original gen4 does type conversion to the destination type
 463           * before comparison, producing garbage results for floating
 464           * point comparisons.
 465           *
 466           * The destination type doesn't matter on newer generations,
 467           * so we set the type to match src0 so we can compact the
 468           * instruction.
 469           */
 470          return set_condmod(condition,
 471                             emit(BRW_OPCODE_CMP, retype(dst, src0.type),
 472                                  fix_unsigned_negate(src0),
 473                                  fix_unsigned_negate(src1)));
 474       }
 475
 476       /**
 477        * Gen4 predicated IF.
 478        */
 479       instruction *
 480       IF(brw_predicate predicate) const
 481       {
 482          return set_predicate(predicate, emit(BRW_OPCODE_IF));
 483       }
 484
 485       /**
 486        * Emit a linear interpolation instruction.
 487        */
 488       instruction *
 489       LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
 490           const src_reg &a) const
 491       {
 492          if (shader->devinfo->gen >= 6) {
 493             /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 494              * we need to reorder the operands.
 495              */
 496             return emit(BRW_OPCODE_LRP, dst, a, y, x);
 497
 498          } else {
 499             /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 500             const dst_reg y_times_a = vgrf(dst.type);
 501             const dst_reg one_minus_a = vgrf(dst.type);
 502             const dst_reg x_times_one_minus_a = vgrf(dst.type);
 503
 504             MUL(y_times_a, y, a);
 505             ADD(one_minus_a, negate(a), src_reg(1.0f));
 506             MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
 507             return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
 508          }
 509       }
 510
 511       /**
 512        * Collect a number of registers in a contiguous range of registers.
 513        */
 514       instruction *
 515       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
 516                    unsigned sources, unsigned header_size) const
 517       {
 518          assert(dst.width % 8 == 0);
 519          instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
 520                                               dst.width, dst, src, sources));
 521          inst->header_size = header_size;
 522
 523          for (unsigned i = 0; i < header_size; i++)
 524             assert(src[i].file != GRF ||
 525                    src[i].width * type_sz(src[i].type) == 32);
 526          inst->regs_written = header_size;
 527
 528          for (unsigned i = header_size; i < sources; ++i)
 529             assert(src[i].file != GRF ||
 530                    src[i].width == dst.width);
 531          inst->regs_written += (sources - header_size) * (dst.width / 8);
 532
 533          return inst;
 534       }
 535
  536       backend_shader *shader; /**< Shader this builder emits instructions into and allocates registers from. */
 537
 538    private:
 539       /**
 540        * Workaround for negation of UD registers.  See comment in
 541        * fs_generator::generate_code() for more details.
 542        */
 543       src_reg
 544       fix_unsigned_negate(const src_reg &src) const
 545       {
 546          if (src.type == BRW_REGISTER_TYPE_UD &&
 547              src.negate) {
 548             dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
 549             MOV(temp, src);
 550             return src_reg(temp);
 551          } else {
 552             return src;
 553          }
 554       }
 555
 556       /**
 557        * Workaround for source register modes not supported by the ternary
 558        * instruction encoding.
 559        */
 560       src_reg
 561       fix_3src_operand(const src_reg &src) const
 562       {
 563          if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
 564             return src;
 565          } else {
 566             dst_reg expanded = vgrf(src.type);
 567             MOV(expanded, src);
 568             return expanded;
 569          }
 570       }
 571
 572       /**
 573        * Workaround for source register modes not supported by the math
 574        * instruction.
 575        */
 576       src_reg
 577       fix_math_operand(const src_reg &src) const
 578       {
 579          /* Can't do hstride == 0 args on gen6 math, so expand it out. We
 580           * might be able to do better by doing execsize = 1 math and then
 581           * expanding that result out, but we would need to be careful with
 582           * masking.
 583           *
 584           * Gen6 hardware ignores source modifiers (negate and abs) on math
 585           * instructions, so we also move to a temp to set those up.
 586           *
 587           * Gen7 relaxes most of the above restrictions, but still can't use IMM
 588           * operands to math
 589           */
 590          if ((shader->devinfo->gen == 6 &&
 591               (src.file == IMM || src.file == UNIFORM ||
 592                src.abs || src.negate)) ||
 593              (shader->devinfo->gen == 7 && src.file == IMM)) {
 594             const dst_reg tmp = vgrf(src.type);
 595             MOV(tmp, src);
 596             return tmp;
 597          } else {
 598             return src;
 599          }
 600       }
 601
 602       /**
 603        * Workaround other weirdness of the math instruction.
 604        */
 605       instruction *
 606       fix_math_instruction(instruction *inst) const
 607       {
 608          if (shader->devinfo->gen < 6) {
 609             inst->base_mrf = 2;
 610             inst->mlen = inst->sources * dispatch_width() / 8;
 611
 612             if (inst->sources > 1) {
 613                /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
 614                 * "Message Payload":
 615                 *
 616                 * "Operand0[7].  For the INT DIV functions, this operand is the
 617                 *  denominator."
 618                 *  ...
 619                 * "Operand1[7].  For the INT DIV functions, this operand is the
 620                 *  numerator."
 621                 */
 622                const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
 623                const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
 624                const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
 625
 626                inst->resize_sources(1);
 627                inst->src[0] = src0;
 628
 629                at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type,
 630                                           dispatch_width()), src1);
 631             }
 632          }
 633
 634          return inst;
 635       }
 636
  637       bblock_t *block; /**< Basic block passed along on insertion, or NULL for plain list insertion. */
  638       exec_node *cursor; /**< Node that emitted instructions are inserted before. */
  639
  640       unsigned _dispatch_width; /**< SIMD width reported by dispatch_width(). */
  641       unsigned _group; /**< Channel offset accumulated by group(); emit() accepts only 0 or 8. */
  642       bool force_writemask_all; /**< Copied to every emitted instruction; set by exec_all(). */
  643
  644       /** Debug annotation info, copied to every emitted instruction. */
  645       struct {
  646          const char *str; /**< Stored in the annotation field of the instruction. */
  647          const void *ir; /**< Stored in the ir field of the instruction. */
  648       } annotation;
 649    };
 650 }
 651
 652 #endif