i965: Move the back-end compiler to src/intel/compiler
[mesa.git] / src / intel / compiler / brw_vec4_builder.h
1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef BRW_VEC4_BUILDER_H
26 #define BRW_VEC4_BUILDER_H
27
28 #include "brw_ir_vec4.h"
29 #include "brw_ir_allocator.h"
30
31 namespace brw {
32 /**
33 * Toolbox to assemble a VEC4 IR program out of individual instructions.
34 *
35 * This object is meant to have an interface consistent with
36 * brw::fs_builder. They cannot be fully interchangeable because
37 * brw::fs_builder generates scalar code while brw::vec4_builder generates
38 * vector code.
39 */
class vec4_builder {
public:
   /** Type used in this IR to represent a source of an instruction. */
   typedef brw::src_reg src_reg;

   /** Type used in this IR to represent the destination of an instruction. */
   typedef brw::dst_reg dst_reg;

   /** Type used in this IR to represent an instruction. */
   typedef vec4_instruction instruction;

   /**
    * Construct a vec4_builder that inserts instructions into \p shader.
    */
   vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
      shader(shader), block(NULL), cursor(NULL),
      _dispatch_width(dispatch_width), _group(0),
      force_writemask_all(false),
      annotation()
   {
   }

   /**
    * Construct a vec4_builder that inserts instructions into \p shader
    * before instruction \p inst in basic block \p block.  The default
    * execution controls and debug annotation are initialized from the
    * instruction passed as argument.
    */
   vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
      shader(shader), block(block), cursor(inst),
      _dispatch_width(inst->exec_size), _group(inst->group),
      force_writemask_all(inst->force_writemask_all)
   {
      annotation.str = inst->annotation;
      annotation.ir = inst->ir;
   }

   /**
    * Construct a vec4_builder that inserts instructions before \p cursor
    * in basic block \p block, inheriting other code generation parameters
    * from this.
    */
   vec4_builder
   at(bblock_t *block, exec_node *cursor) const
   {
      vec4_builder bld = *this;
      bld.block = block;
      bld.cursor = cursor;
      return bld;
   }

   /**
    * Construct a vec4_builder appending instructions at the end of the
    * instruction list of the shader, inheriting other code generation
    * parameters from this.
    */
   vec4_builder
   at_end() const
   {
      /* Inserting before the list's tail sentinel is equivalent to
       * appending at the end of the program.  No basic block is
       * specified, so emit() falls back to raw list insertion.
       */
      return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
   }

   /**
    * Construct a builder specifying the default SIMD width and group of
    * channel enable signals, inheriting other code generation parameters
    * from this.
    *
    * \p n gives the default SIMD width, \p i gives the slot group used for
    * predication and control flow masking in multiples of \p n channels.
    */
   vec4_builder
   group(unsigned n, unsigned i) const
   {
      /* The requested group of n channels at slot i must lie within the
       * current dispatch width, unless execution masking is disabled
       * altogether.
       */
      assert(force_writemask_all ||
             (n <= dispatch_width() && i < dispatch_width() / n));
      vec4_builder bld = *this;
      bld._dispatch_width = n;
      bld._group += i * n;
      return bld;
   }

   /**
    * Construct a builder with per-channel control flow execution masking
    * disabled if \p b is true.  If control flow execution masking is
    * already disabled this has no effect.
    */
   vec4_builder
   exec_all(bool b = true) const
   {
      vec4_builder bld = *this;
      if (b)
         bld.force_writemask_all = true;
      return bld;
   }

   /**
    * Construct a builder with the given debug annotation info.
    */
   vec4_builder
   annotate(const char *str, const void *ir = NULL) const
   {
      vec4_builder bld = *this;
      bld.annotation.str = str;
      bld.annotation.ir = ir;
      return bld;
   }

   /**
    * Get the SIMD width in use.
    */
   unsigned
   dispatch_width() const
   {
      return _dispatch_width;
   }

   /**
    * Get the channel group in use.
    */
   unsigned
   group() const
   {
      return _group;
   }

   /**
    * Allocate a virtual register of natural vector size (four for this IR)
    * and SIMD width.  \p n gives the amount of space to allocate in
    * dispatch_width units (which is just enough space for four logical
    * components in this IR).
    */
   dst_reg
   vgrf(enum brw_reg_type type, unsigned n = 1) const
   {
      assert(dispatch_width() <= 32);

      if (n > 0)
         /* Scale the allocation by the type size in 32-bit units so that
          * types wider than 32 bits get proportionally more register
          * space per logical component.
          */
         return retype(dst_reg(VGRF, shader->alloc.allocate(
                                  n * DIV_ROUND_UP(type_sz(type), 4))),
                       type);
      else
         /* A zero-sized allocation degenerates to the null register. */
         return retype(null_reg_ud(), type);
   }

   /**
    * Create a null register of floating type.
    */
   dst_reg
   null_reg_f() const
   {
      return dst_reg(retype(brw_null_vec(dispatch_width()),
                            BRW_REGISTER_TYPE_F));
   }

   /**
    * Create a null register of signed integer type.
    */
   dst_reg
   null_reg_d() const
   {
      return dst_reg(retype(brw_null_vec(dispatch_width()),
                            BRW_REGISTER_TYPE_D));
   }

   /**
    * Create a null register of unsigned integer type.
    */
   dst_reg
   null_reg_ud() const
   {
      return dst_reg(retype(brw_null_vec(dispatch_width()),
                            BRW_REGISTER_TYPE_UD));
   }

   /**
    * Insert an instruction into the program.
    */
   instruction *
   emit(const instruction &inst) const
   {
      /* Heap-allocate a copy out of the shader's memory context and defer
       * to the pointer overload below.
       */
      return emit(new(shader->mem_ctx) instruction(inst));
   }

   /**
    * Create and insert a nullary control instruction into the program.
    */
   instruction *
   emit(enum opcode opcode) const
   {
      return emit(instruction(opcode));
   }

   /**
    * Create and insert a nullary instruction into the program.
    */
   instruction *
   emit(enum opcode opcode, const dst_reg &dst) const
   {
      return emit(instruction(opcode, dst));
   }

   /**
    * Create and insert a unary instruction into the program.
    */
   instruction *
   emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
   {
      switch (opcode) {
      /* Math opcodes get their operand and instruction legalized for the
       * hardware math unit, see fix_math_operand() and
       * fix_math_instruction() below.
       */
      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         return fix_math_instruction(
            emit(instruction(opcode, dst,
                             fix_math_operand(src0))));

      default:
         return emit(instruction(opcode, dst, src0));
      }
   }

   /**
    * Create and insert a binary instruction into the program.
    */
   instruction *
   emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
        const src_reg &src1) const
   {
      switch (opcode) {
      /* Two-source math opcodes need the same legalization as the unary
       * ones above, applied to both operands.
       */
      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         return fix_math_instruction(
            emit(instruction(opcode, dst,
                             fix_math_operand(src0),
                             fix_math_operand(src1))));

      default:
         return emit(instruction(opcode, dst, src0, src1));
      }
   }

   /**
    * Create and insert a ternary instruction into the program.
    */
   instruction *
   emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
        const src_reg &src1, const src_reg &src2) const
   {
      switch (opcode) {
      /* Three-source instructions have a restricted encoding, legalize
       * each operand through fix_3src_operand() below.
       */
      case BRW_OPCODE_BFE:
      case BRW_OPCODE_BFI2:
      case BRW_OPCODE_MAD:
      case BRW_OPCODE_LRP:
         return emit(instruction(opcode, dst,
                                 fix_3src_operand(src0),
                                 fix_3src_operand(src1),
                                 fix_3src_operand(src2)));

      default:
         return emit(instruction(opcode, dst, src0, src1, src2));
      }
   }

   /**
    * Insert a preallocated instruction into the program.
    */
   instruction *
   emit(instruction *inst) const
   {
      /* Stamp the builder's current execution controls and debug
       * annotation onto the instruction before inserting it.
       */
      inst->exec_size = dispatch_width();
      inst->group = group();
      inst->force_writemask_all = force_writemask_all;
      inst->size_written = inst->exec_size * type_sz(inst->dst.type);
      inst->annotation = annotation.str;
      inst->ir = annotation.ir;

      /* Use the block-aware insertion overload when the basic block is
       * known, otherwise insert into the raw instruction list.
       */
      if (block)
         static_cast<instruction *>(cursor)->insert_before(block, inst);
      else
         cursor->insert_before(inst);

      return inst;
   }

   /**
    * Select \p src0 if the comparison of both sources with the given
    * conditional mod evaluates to true, otherwise select \p src1.
    *
    * Generally useful to get the minimum or maximum of two values.
    */
   instruction *
   emit_minmax(const dst_reg &dst, const src_reg &src0,
               const src_reg &src1, brw_conditional_mod mod) const
   {
      /* Only GE (maximum) and L (minimum) make sense for a SEL-based
       * min/max.
       */
      assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

      return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                  fix_unsigned_negate(src1)));
   }

   /**
    * Copy any live channel from \p src to the first channel of the result.
    */
   src_reg
   emit_uniformize(const src_reg &src) const
   {
      /* Both instructions run with execution masking disabled so they
       * execute even in channels that are currently dead.
       */
      const vec4_builder ubld = exec_all();
      const dst_reg chan_index =
         writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
      const dst_reg dst = vgrf(src.type);

      ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
      ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));

      return src_reg(dst);
   }

   /**
    * Assorted arithmetic ops.
    * @{
    */
#define ALU1(op)                                        \
   instruction *                                        \
   op(const dst_reg &dst, const src_reg &src0) const    \
   {                                                    \
      return emit(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1);                    \
   }

#define ALU2_ACC(op)                                                    \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
   {                                                                    \
      instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);       \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   instruction *                                                        \
   op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,     \
      const src_reg &src2) const                                        \
   {                                                                    \
      return emit(BRW_OPCODE_##op, dst, src0, src1, src2);              \
   }

   ALU2(ADD)
   ALU2_ACC(ADDC)
   ALU2(AND)
   ALU2(ASR)
   ALU2(AVG)
   ALU3(BFE)
   ALU2(BFI1)
   ALU3(BFI2)
   ALU1(BFREV)
   ALU1(CBIT)
   ALU2(CMPN)
   ALU3(CSEL)
   ALU1(DIM)
   ALU2(DP2)
   ALU2(DP3)
   ALU2(DP4)
   ALU2(DPH)
   ALU1(F16TO32)
   ALU1(F32TO16)
   ALU1(FBH)
   ALU1(FBL)
   ALU1(FRC)
   ALU2(LINE)
   ALU1(LZD)
   ALU2(MAC)
   ALU2_ACC(MACH)
   ALU3(MAD)
   ALU1(MOV)
   ALU2(MUL)
   ALU1(NOT)
   ALU2(OR)
   ALU2(PLN)
   ALU1(RNDD)
   ALU1(RNDE)
   ALU1(RNDU)
   ALU1(RNDZ)
   ALU2(SAD2)
   ALU2_ACC(SADA2)
   ALU2(SEL)
   ALU2(SHL)
   ALU2(SHR)
   ALU2_ACC(SUBB)
   ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
   /** @} */

   /**
    * CMP: Sets the low bit of the destination channels with the result
    * of the comparison, while the upper bits are undefined, and updates
    * the flag register with the packed 16 bits of the result.
    */
   instruction *
   CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
       brw_conditional_mod condition) const
   {
      /* Take the instruction:
       *
       * CMP null<d> src0<f> src1<f>
       *
       * Original gen4 does type conversion to the destination type
       * before comparison, producing garbage results for floating
       * point comparisons.
       *
       * The destination type doesn't matter on newer generations,
       * so we set the type to match src0 so we can compact the
       * instruction.
       */
      return set_condmod(condition,
                         emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                              fix_unsigned_negate(src0),
                              fix_unsigned_negate(src1)));
   }

   /**
    * Gen4 predicated IF.
    */
   instruction *
   IF(brw_predicate predicate) const
   {
      return set_predicate(predicate, emit(BRW_OPCODE_IF));
   }

   /**
    * Gen6 IF with embedded comparison.
    */
   instruction *
   IF(const src_reg &src0, const src_reg &src1,
      brw_conditional_mod condition) const
   {
      /* The embedded-comparison form of IF only exists on gen6. */
      assert(shader->devinfo->gen == 6);
      return set_condmod(condition,
                         emit(BRW_OPCODE_IF,
                              null_reg_d(),
                              fix_unsigned_negate(src0),
                              fix_unsigned_negate(src1)));
   }

   /**
    * Emit a linear interpolation instruction.
    */
   instruction *
   LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
       const src_reg &a) const
   {
      if (shader->devinfo->gen >= 6) {
         /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
          * we need to reorder the operands.
          */
         return emit(BRW_OPCODE_LRP, dst, a, y, x);

      } else {
         /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
         const dst_reg y_times_a = vgrf(dst.type);
         const dst_reg one_minus_a = vgrf(dst.type);
         const dst_reg x_times_one_minus_a = vgrf(dst.type);

         MUL(y_times_a, y, a);
         ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
         MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
         return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
      }
   }

   /** Shader program being constructed. */
   backend_shader *shader;

protected:
   /**
    * Workaround for negation of UD registers.  See comment in
    * fs_generator::generate_code() for the details.
    */
   src_reg
   fix_unsigned_negate(const src_reg &src) const
   {
      if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
         /* Resolve the negation through an explicit MOV into a
          * temporary, and hand back the unmodified temporary.
          */
         dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
         MOV(temp, src);
         return src_reg(temp);
      } else {
         return src;
      }
   }

   /**
    * Workaround for register access modes not supported by the ternary
    * instruction encoding.
    */
   src_reg
   fix_3src_operand(const src_reg &src) const
   {
      /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
       * able to use vertical stride of zero to replicate the vec4 uniform, like
       *
       *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
       *
       * But you can't, since vertical stride is always four in three-source
       * instructions. Instead, insert a MOV instruction to do the replication so
       * that the three-source instruction can consume it.
       */

      /* The MOV is only needed if the source is a uniform or immediate. */
      if (src.file != UNIFORM && src.file != IMM)
         return src;

      if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
         return src;

      const dst_reg expanded = vgrf(src.type);
      emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
      return src_reg(expanded);
   }

   /**
    * Workaround for register access modes not supported by the math
    * instruction.
    */
   src_reg
   fix_math_operand(const src_reg &src) const
   {
      /* The gen6 math instruction ignores the source modifiers --
       * swizzle, abs, negate, and at least some parts of the register
       * region description.
       *
       * Rather than trying to enumerate all these cases, *always* expand the
       * operand to a temp GRF for gen6.
       *
       * For gen7, keep the operand as-is, except if immediate, which gen7 still
       * can't use.
       */
      if (shader->devinfo->gen == 6 ||
          (shader->devinfo->gen == 7 && src.file == IMM)) {
         const dst_reg tmp = vgrf(src.type);
         MOV(tmp, src);
         return src_reg(tmp);
      } else {
         return src;
      }
   }

   /**
    * Workaround other weirdness of the math instruction.
    */
   instruction *
   fix_math_instruction(instruction *inst) const
   {
      if (shader->devinfo->gen == 6 &&
          inst->dst.writemask != WRITEMASK_XYZW) {
         /* \p inst has already been inserted before the cursor, so the
          * MOV emitted here ends up right after it.  The MOV is created
          * while inst->dst still holds the original (partially
          * writemasked) destination; \p inst is then retargeted to write
          * the whole temporary, which the MOV copies back into the
          * original destination under its writemask.
          */
         const dst_reg tmp = vgrf(inst->dst.type);
         MOV(inst->dst, src_reg(tmp));
         inst->dst = tmp;

      } else if (shader->devinfo->gen < 6) {
         /* Pre-gen6 math reads its arguments as a message payload --
          * point the instruction at the MRF space and give the payload
          * length in number of sources.
          */
         const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
         inst->base_mrf = 1;
         inst->mlen = sources;
      }

      return inst;
   }

   /** Basic block new instructions are inserted into, may be NULL. */
   bblock_t *block;
   /** Insertion point: new instructions are inserted before this node. */
   exec_node *cursor;

   /** Default SIMD width of emitted instructions. */
   unsigned _dispatch_width;
   /** Default channel enable group of emitted instructions. */
   unsigned _group;
   /** Whether per-channel execution masking is disabled by default. */
   bool force_writemask_all;

   /** Debug annotation info. */
   struct {
      const char *str;
      const void *ir;
   } annotation;
};
632 }
633
634 #endif