src/mesa/drivers/dri/i965/brw_fs_builder.h

   1 /* -*- c++ -*- */
   2 /*
   3  * Copyright © 2010-2015 Intel Corporation
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 #ifndef BRW_FS_BUILDER_H
  26 #define BRW_FS_BUILDER_H
  27
  28 #include "brw_ir_fs.h"
  29 #include "brw_shader.h"
  30 #include "brw_context.h"
  31
  32 namespace brw {
  33    /**
  34     * Toolbox to assemble an FS IR program out of individual instructions.
  35     *
  36     * This object is meant to have an interface consistent with
  37     * brw::vec4_builder.  They cannot be fully interchangeable because
  38     * brw::fs_builder generates scalar code while brw::vec4_builder generates
  39     * vector code.
  40     */
  41    class fs_builder {
  42    public:
  43       /** Register type this IR uses for instruction source operands. */
  44       typedef fs_reg src_reg;
  45
  46       /** Register type this IR uses for instruction destinations. */
  47       typedef fs_reg dst_reg;
  48
  49       /** Instruction type of this IR. */
  50       typedef fs_inst instruction;
  51
  52       /**
  53        * Build an fs_builder emitting into \p shader with \p dispatch_width
  54        * as its native execution width.  No insertion point is set yet.
  55        */
  56       fs_builder(backend_shader *shader, unsigned dispatch_width) :
  57          shader(shader),
  58          block(NULL),
  59          cursor(NULL),
  60          _dispatch_width(dispatch_width),
  61          _group(0),
  62          force_writemask_all(false),
  63          annotation()
  64       { }
  65
  66       /**
  67        * Return a copy of this builder whose insertion point is just before
  68        * \p cursor inside basic block \p block.  Every other code generation
  69        * parameter is carried over unchanged.
  70        */
  71       fs_builder
  72       at(bblock_t *block, exec_node *cursor) const
  73       {
  74          fs_builder moved(*this);
  75          moved.cursor = cursor;
  76          moved.block = block;
  77          return moved;
  78       }
  79
  80       /**
  81        * Return a copy of this builder that appends at the tail of the
  82        * shader's instruction list, with no basic block attached.
  83        */
  84       fs_builder
  85       at_end() const
  86       {
  87          exec_node *const tail = (exec_node *)&shader->instructions.tail;
  88          return at(NULL, tail);
  89       }
  90
  91       /**
  92        * Return a copy of this builder with a new default SIMD width and
  93        * channel enable group; other parameters are carried over unchanged.
  94        *
  95        * \p n is the new default SIMD width.  \p i selects, in units of \p n
  96        * channels, the slot group used for predication and control flow
  97        * masking; it is applied on top of the group already in effect.
  98        */
  99       fs_builder
 100       group(unsigned n, unsigned i) const
 101       {
 102          assert(force_writemask_all ||
 103                 (n <= dispatch_width() && i < dispatch_width() / n));
 104          fs_builder sub(*this);
 105          sub._group = _group + i * n;
 106          sub._dispatch_width = n;
 107          return sub;
 108       }
 109
 110       /** Shorthand for group() with a SIMD width of eight channels. */
 111       fs_builder
 112       half(unsigned i) const
 113       {
 114          const unsigned half_width = 8;
 115
 116          return group(half_width, i);
 117       }
 118
 119       /**
 120        * Return a copy of this builder that, when \p b is true, has
 121        * per-channel control flow execution masking turned off.  A builder
 122        * that already has masking disabled stays that way regardless of \p b.
 123        */
 124       fs_builder
 125       exec_all(bool b = true) const
 126       {
 127          fs_builder unmasked(*this);
 128          /* Can only ever switch masking off, never back on. */
 129          unmasked.force_writemask_all = force_writemask_all || b;
 130          return unmasked;
 131       }
 132
 133       /**
 134        * Return a copy of this builder tagged with debug annotation info.
 135        */
 136       fs_builder
 137       annotate(const char *str, const void *ir = NULL) const
 138       {
 139          fs_builder tagged(*this);
 140          tagged.annotation.ir = ir;
 141          tagged.annotation.str = str;
 142          return tagged;
 143       }
 144
 145       /**
 146        * Return the SIMD width this builder currently emits code for.
 147        */
 148       unsigned
 149       dispatch_width() const
 150       {
 151          return this->_dispatch_width;
 152       }
 153
 154       /**
 155        * Allocate a fresh virtual register of type \p type, sized for the
 156        * natural vector size of this IR (one) at the current SIMD width.
 157        * \p n scales the allocation in dispatch_width units, each of which
 158        * holds exactly one logical component in this IR.
 159        */
 160       dst_reg
 161       vgrf(enum brw_reg_type type, unsigned n = 1) const
 162       {
 163          /* Total size over all channels, rounded up to REG_SIZE units. */
 164          const unsigned regs =
 165             DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), REG_SIZE);
 166          return dst_reg(GRF, shader->alloc.allocate(regs), type);
 167       }
 168
 169       /**
 170        * Return the null register, retyped as floating point.
 171        */
 172       dst_reg
 173       null_reg_f() const
 174       {
 175          const unsigned width = dispatch_width();
 176          return dst_reg(retype(brw_null_vec(width), BRW_REGISTER_TYPE_F));
 177       }
 178
 179       /**
 180        * Return the null register, retyped as signed integer.
 181        */
 182       dst_reg
 183       null_reg_d() const
 184       {
 185          const unsigned width = dispatch_width();
 186          return dst_reg(retype(brw_null_vec(width), BRW_REGISTER_TYPE_D));
 187       }
 188
 189       /**
 190        * Return the null register, retyped as unsigned integer.
 191        */
 192       dst_reg
 193       null_reg_ud() const
 194       {
 195          const unsigned width = dispatch_width();
 196          return dst_reg(retype(brw_null_vec(width), BRW_REGISTER_TYPE_UD));
 197       }
 198
 199       /**
 200        * Return the mask of SIMD channels that were enabled at dispatch and
 201        * have not been disabled by a discard since.
 202        */
 203       src_reg
 204       sample_mask_reg() const
 205       {
 206          if (shader->stage != MESA_SHADER_FRAGMENT)
 207             return src_reg(0xffff);
 208          if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill)
 209             return src_reg(brw_flag_reg(0, 1));
 210
 211          return src_reg(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
 212       }
 213
 214       /** Insert a copy of \p inst, allocated from the shader's mem_ctx. */
 215       instruction *
 216       emit(const instruction &inst) const
 217       {
 218          instruction *const copy = new(shader->mem_ctx) instruction(inst);
 219
 220          return emit(copy);
 221       }
 222
 223       /** Build a control instruction without operands and insert it. */
 224       instruction *
 225       emit(enum opcode opcode) const
 226       {
 227          const instruction inst(opcode, dispatch_width());
 228
 229          return emit(inst);
 230       }
 231
 232       /** Build an instruction with only a destination and insert it. */
 233       instruction *
 234       emit(enum opcode opcode, const dst_reg &dst) const
 235       {
 236          const instruction inst(opcode, dispatch_width(), dst);
 237
 238          return emit(inst);
 239       }
 240
 241       /**
 242        * Build a unary instruction and insert it, legalizing math opcodes.
 243        */
 244       instruction *
 245       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
 246       {
 247          switch (opcode) {
 248          case SHADER_OPCODE_RCP:
 249          case SHADER_OPCODE_RSQ:
 250          case SHADER_OPCODE_SQRT:
 251          case SHADER_OPCODE_EXP2:
 252          case SHADER_OPCODE_LOG2:
 253          case SHADER_OPCODE_SIN:
 254          case SHADER_OPCODE_COS: {
 255             const instruction math(opcode, dispatch_width(), dst,
 256                                    fix_math_operand(src0));
 257             return fix_math_instruction(emit(math));
 258          }
 259          default:
 260             return emit(instruction(opcode, dispatch_width(), dst, src0));
 261          }
 262       }
 263
 264       /**
 265        * Build a binary instruction and insert it, legalizing math opcodes.
 266        */
 267       instruction *
 268       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
 269            const src_reg &src1) const
 270       {
 271          switch (opcode) {
 272          case SHADER_OPCODE_POW:
 273          case SHADER_OPCODE_INT_QUOTIENT:
 274          case SHADER_OPCODE_INT_REMAINDER: {
 275             const instruction math(opcode, dispatch_width(), dst,
 276                                    fix_math_operand(src0),
 277                                    fix_math_operand(src1));
 278             return fix_math_instruction(emit(math));
 279          }
 280
 281          default:
 282             return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
 283          }
 284       }
 285
 286       /** Build a ternary instruction and insert it, legalizing 3-src ops. */
 287       instruction *
 288       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
 289            const src_reg &src1, const src_reg &src2) const
 290       {
 291          switch (opcode) {
 292          case BRW_OPCODE_BFE:
 293          case BRW_OPCODE_BFI2:
 294          case BRW_OPCODE_MAD:
 295          case BRW_OPCODE_LRP: {
 296             const instruction alu3(opcode, dispatch_width(), dst,
 297                                    fix_3src_operand(src0),
 298                                    fix_3src_operand(src1),
 299                                    fix_3src_operand(src2));
 300             return emit(alu3);
 301          }
 302
 303          default:
 304             return emit(instruction(opcode, dispatch_width(), dst,
 305                                     src0, src1, src2));
 306          }
 307       }
 308
 309       /**
 310        * Build an instruction taking the \p n sources in \p srcs and insert it.
 311        */
 312       instruction *
 313       emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
 314            unsigned n) const
 315       {
 316          const instruction inst(opcode, dispatch_width(), dst, srcs, n);
 317          return emit(inst);
 318       }
 319
 320       /**
 321        * Stamp the builder's state onto the preallocated \p inst and insert it.
 322        */
 323       instruction *
 324       emit(instruction *inst) const
 325       {
 326          assert(inst->exec_size == dispatch_width() ||
 327                 force_writemask_all);
 328          assert(_group == 0 || _group == 8);
 329
 330          inst->force_writemask_all = force_writemask_all;
 331          inst->force_sechalf = (_group == 8);
 332          inst->ir = annotation.ir;
 333          inst->annotation = annotation.str;
 334
 335          if (block == NULL)
 336             cursor->insert_before(inst);
 337          else
 338             static_cast<instruction *>(cursor)->insert_before(block, inst);
 339
 340          return inst;
 341       }
 342
 343       /**
 344        * Write to \p dst whichever of \p src0 and \p src1 wins the comparison
 345        * given by conditional mod \p mod: \p src0 if it holds, \p src1 if not.
 346        *
 347        * Handy for computing the minimum or maximum of two values.
 348        */
 349       void
 350       emit_minmax(const dst_reg &dst, const src_reg &src0,
 351                   const src_reg &src1, brw_conditional_mod mod) const
 352       {
 353          if (shader->devinfo->gen < 6) {
 354             /* Compare explicitly, then emit a predicated SEL. */
 355             CMP(null_reg_d(), src0, src1, mod);
 356             set_predicate(BRW_PREDICATE_NORMAL, SEL(dst, src0, src1));
 357          } else {
 358             set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
 359                                  fix_unsigned_negate(src1)));
 360          }
 361       }
 362
 363       /**
 364        * Broadcast \p src from a live channel into channel 0 of the result.
 365        */
 366       src_reg
 367       emit_uniformize(const src_reg &src) const
 368       {
 369          /* Allocate both temporaries before emitting anything. */
 370          const dst_reg live_chan = component(vgrf(BRW_REGISTER_TYPE_UD), 0);
 371          const dst_reg result = component(vgrf(src.type), 0);
 372          const fs_builder unmasked = exec_all();
 373
 374          unmasked.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, live_chan);
 375          unmasked.emit(SHADER_OPCODE_BROADCAST, result, src, live_chan);
 376          return src_reg(result);
 377       }
 378
 379       /**
 380        * One emit helper per arithmetic opcode, generated by the macros below.
 381        * @{
 382        */
 383 #define ALU1(op) \
 384       instruction * \
 385       op(const dst_reg &dst, const src_reg &src0) const \
 386       { \
 387          return emit(BRW_OPCODE_##op, dst, src0); \
 388       }
 389
 390 #define ALU2(op) \
 391       instruction * \
 392       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
 393       { \
 394          return emit(BRW_OPCODE_##op, dst, src0, src1); \
 395       }
 396
 397 #define ALU2_ACC(op) \
 398       instruction * \
 399       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
 400       { \
 401          instruction *const acc = emit(BRW_OPCODE_##op, dst, src0, src1); \
 402          acc->writes_accumulator = true; \
 403          return acc; \
 404       }
 405
 406 #define ALU3(op) \
 407       instruction * \
 408       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
 409          const src_reg &src2) const \
 410       { \
 411          return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
 412       }
 413
 414       ALU2(ADD)
 415       ALU2_ACC(ADDC)
 416       ALU2(AND)
 417       ALU2(ASR)
 418       ALU2(AVG)
 419       ALU3(BFE)
 420       ALU2(BFI1)
 421       ALU3(BFI2)
 422       ALU1(BFREV)
 423       ALU1(CBIT)
 424       ALU2(CMPN)
 425       ALU3(CSEL)
 426       ALU2(DP2)
 427       ALU2(DP3)
 428       ALU2(DP4)
 429       ALU2(DPH)
 430       ALU1(F16TO32)
 431       ALU1(F32TO16)
 432       ALU1(FBH)
 433       ALU1(FBL)
 434       ALU1(FRC)
 435       ALU2(LINE)
 436       ALU1(LZD)
 437       ALU2(MAC)
 438       ALU2_ACC(MACH)
 439       ALU3(MAD)
 440       ALU1(MOV)
 441       ALU2(MUL)
 442       ALU1(NOT)
 443       ALU2(OR)
 444       ALU2(PLN)
 445       ALU1(RNDD)
 446       ALU1(RNDE)
 447       ALU1(RNDU)
 448       ALU1(RNDZ)
 449       ALU2(SAD2)
 450       ALU2_ACC(SADA2)
 451       ALU2(SEL)
 452       ALU2(SHL)
 453       ALU2(SHR)
 454       ALU2_ACC(SUBB)
 455       ALU2(XOR)
 456
 457 #undef ALU3
 458 #undef ALU2_ACC
 459 #undef ALU2
 460 #undef ALU1
 461       /** @} */
 462
 463       /**
 464        * CMP: Writes the comparison result to the low bit of each destination
 465        * channel, leaving the upper bits undefined, and stores the packed 16
 466        * bits of the result in the flag register.
 467        */
 468       instruction *
 469       CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
 470           brw_conditional_mod condition) const
 471       {
 472          /* Consider the instruction:
 473           *
 474           * CMP null<d> src0<f> src1<f>
 475           *
 476           * Original gen4 converts to the destination type before comparing,
 477           * so floating point comparisons yield garbage.
 478           *
 479           * Newer generations don't care about the destination type, so
 480           * retype it to match src0, which lets the instruction be compacted.
 481           */
 482          instruction *const cmp =
 483             emit(BRW_OPCODE_CMP, retype(dst, src0.type),
 484                  fix_unsigned_negate(src0),
 485                  fix_unsigned_negate(src1));
 486
 487          return set_condmod(condition, cmp);
 488       }
 489
 490       /** Emit an IF predicated on \p predicate (the Gen4 form of IF). */
 491       instruction *
 492       IF(brw_predicate predicate) const
 493       {
 494          instruction *const branch = emit(BRW_OPCODE_IF);
 495
 496          return set_predicate(predicate, branch);
 497       }
 498
 499       /**
 500        * Emit code blending \p x and \p y linearly by factor \p a into \p dst.
 501        */
 502       instruction *
 503       LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
 504           const src_reg &a) const
 505       {
 506          if (shader->devinfo->gen < 6) {
 507             /* No usable LRP instruction here.  Emit x*(1-a) + y*a. */
 508             const dst_reg ya = vgrf(dst.type);
 509             const dst_reg inv_a = vgrf(dst.type);
 510             const dst_reg x_inv_a = vgrf(dst.type);
 511
 512             MUL(ya, y, a);
 513             ADD(inv_a, negate(a), src_reg(1.0f));
 514             MUL(x_inv_a, x, src_reg(inv_a));
 515             return ADD(dst, src_reg(x_inv_a), src_reg(ya));
 516          }
 517
 518          /* The LRP instruction actually computes
 519           * op1 * op0 + op2 * (1 - op0), so the operands have to be passed
 520           * in reverse order.
 521           */
 522          return emit(BRW_OPCODE_LRP, dst, a, y, x);
 523       }
 524
 525       /**
 526        * Gather \p sources registers from \p src into a contiguous range.
 527        */
 528       instruction *
 529       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
 530                    unsigned sources, unsigned header_size) const
 531       {
 532          instruction *const load =
 533             emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
 534          load->regs_written = header_size +
 535             (sources - header_size) * (dispatch_width() / 8);
 536          load->header_size = header_size;
 537          return load;
 538       }
 539
 540       backend_shader *shader;
 541
 542    private:
 543       /**
 544        * Work around negation of UD registers by resolving the modifier with
 545        * a MOV into a temporary.  See the comment in
 546        * fs_generator::generate_code() for more details.
 547        */
 548       src_reg
 549       fix_unsigned_negate(const src_reg &src) const
 550       {
 551          if (src.type != BRW_REGISTER_TYPE_UD || !src.negate)
 552             return src;
 553
 554          /* Let a MOV apply the negation and hand back the plain result. */
 555          const dst_reg resolved = vgrf(BRW_REGISTER_TYPE_UD);
 556          MOV(resolved, src);
 557          return src_reg(resolved);
 558       }
 559
 560       /**
 561        * Work around source register modes that the ternary instruction
 562        * encoding cannot express by copying such operands into a fresh VGRF.
 563        */
 564       src_reg
 565       fix_3src_operand(const src_reg &src) const
 566       {
 567          if (src.file == GRF || src.file == UNIFORM || src.stride > 1)
 568             return src;
 569
 570          /* Anything else gets copied out into a freshly allocated VGRF. */
 571          const dst_reg copy = vgrf(src.type);
 572          MOV(copy, src);
 573          return copy;
 574       }
 575
 576       /**
 577        * Work around source register modes that the math instruction cannot
 578        * handle by copying such operands into a fresh VGRF.
 579        */
 580       src_reg
 581       fix_math_operand(const src_reg &src) const
 582       {
 583          /* Gen6 math can't take hstride == 0 arguments, so those get
 584           * expanded out.  Doing execsize = 1 math and expanding the result
 585           * instead might be better, but would need care with masking.
 586           *
 587           * Gen6 hardware also ignores source modifiers (negate and abs) on
 588           * math instructions, so those are resolved into a temporary too.
 589           *
 590           * Gen7 relaxes most of these restrictions, but still can't take
 591           * IMM operands to math.
 592           */
 593          const int gen = shader->devinfo->gen;
 594          const bool gen6_fixup = gen == 6 &&
 595             (src.file == IMM || src.file == UNIFORM || src.abs || src.negate);
 596          const bool gen7_fixup = gen == 7 && src.file == IMM;
 597
 598          if (!gen6_fixup && !gen7_fixup)
 599             return src;
 600
 601          const dst_reg tmp = vgrf(src.type);
 602          MOV(tmp, src);
 603          return tmp;
 604       }
 605
 606       /**
 607        * Turn a pre-Gen6 math instruction into its MRF message form.  Newer
 608        * generations get \p inst back untouched.
 609        */
 610       instruction *
 611       fix_math_instruction(instruction *inst) const
 612       {
 613          if (shader->devinfo->gen >= 6)
 614             return inst;
 615
 616          inst->base_mrf = 2;
 617          inst->mlen = inst->sources * dispatch_width() / 8;
 618          if (inst->sources <= 1)
 619             return inst;
 620
 621          /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
 622           * "Message Payload":
 623           *
 624           * "Operand0[7].  For the INT DIV functions, this operand is the
 625           *  denominator."
 626           *  ...
 627           * "Operand1[7].  For the INT DIV functions, this operand is the
 628           *  numerator."
 629           */
 630          const bool swap = inst->opcode != SHADER_OPCODE_POW;
 631          const fs_reg first = swap ? inst->src[1] : inst->src[0];
 632          const fs_reg second = swap ? inst->src[0] : inst->src[1];
 633
 634          inst->resize_sources(1);
 635          inst->src[0] = first;
 636          at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, second.type),
 637                              second);
 638          return inst;
 639       }
 640
 641       bblock_t *block;     /**< Block to insert into, or NULL. */
 642       exec_node *cursor;   /**< Node new instructions go in front of. */
 643
 644       unsigned _dispatch_width;   /**< Default SIMD width. */
 645       unsigned _group;            /**< Channel offset of the enable group. */
 646       bool force_writemask_all;   /**< Copied onto emitted instructions. */
 647
 648       /** Annotation string and IR pointer stamped on emitted instructions. */
 649       struct {
 650          const char *str;
 651          const void *ir;
 652       } annotation;
 653    };
 654 }
 655
 656 #endif