src/intel/compiler/brw_fs_lower_regioning.cpp

   1 /*
   2  * Copyright © 2018 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_fs.h"
  25 #include "brw_cfg.h"
  26 #include "brw_fs_builder.h"
  27
  28 using namespace brw;
  29
  30 namespace {
  31    /* From the SKL PRM Vol 2a, "Move":
  32     *
  33     * "A mov with the same source and destination type, no source modifier,
  34     *  and no saturation is a raw move. A packed byte destination region (B
  35     *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
  36     *  using raw move."
  37     */
  38    bool
  39    is_byte_raw_mov(const fs_inst *inst)
  40    {
  41       return type_sz(inst->dst.type) == 1 &&
  42              inst->opcode == BRW_OPCODE_MOV &&
  43              inst->src[0].type == inst->dst.type &&
  44              !inst->saturate &&
  45              !inst->src[0].negate &&
  46              !inst->src[0].abs;
  47    }
  48
  49    /*
  50     * Return an acceptable byte stride for the destination of an instruction
  51     * that requires it to have some particular alignment.
  52     */
  53    unsigned
  54    required_dst_byte_stride(const fs_inst *inst)
  55    {
  56       if (inst->dst.is_accumulator()) {
  57          /* If the destination is an accumulator, insist that we leave the
  58           * stride alone.  We cannot "fix" accumulator destinations by writing
  59           * to a temporary and emitting a MOV into the original destination.
  60           * For multiply instructions (our one use of the accumulator), the
  61           * MUL writes the full 66 bits of the accumulator whereas the MOV we
  62           * would emit only writes 33 bits and leaves the top 33 bits
  63           * undefined.
  64           *
  65           * It's safe to just require the original stride here because the
  66           * lowering pass will detect the mismatch in has_invalid_src_region
  67           * and fix the sources of the multiply instead of the destination.
  68           */
  69          return inst->dst.stride * type_sz(inst->dst.type);
  70       } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
  71           !is_byte_raw_mov(inst)) {
  72          return get_exec_type_size(inst);
  73       } else {
  74          /* Calculate the maximum byte stride and the minimum/maximum type
  75           * size across all source and destination operands we are required to
  76           * lower.
  77           */
  78          unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
  79          unsigned min_size = type_sz(inst->dst.type);
  80          unsigned max_size = type_sz(inst->dst.type);
  81
  82          for (unsigned i = 0; i < inst->sources; i++) {
  83             if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
  84                const unsigned size = type_sz(inst->src[i].type);
  85                max_stride = MAX2(max_stride, inst->src[i].stride * size);
  86                min_size = MIN2(min_size, size);
  87                max_size = MAX2(max_size, size);
  88             }
  89          }
  90
  91          /* All operands involved in lowering need to fit in the calculated
  92           * stride.
  93           */
  94          assert(max_size <= 4 * min_size);
  95
  96          /* Attempt to use the largest byte stride among all present operands,
  97           * but never exceed a stride of 4 since that would lead to illegal
  98           * destination regions during lowering.
  99           */
 100          return MIN2(max_stride, 4 * min_size);
 101       }
 102    }
 103
 104    /*
 105     * Return an acceptable byte sub-register offset for the destination of an
 106     * instruction that requires it to be aligned to the sub-register offset of
 107     * the sources.
 108     */
 109    unsigned
 110    required_dst_byte_offset(const fs_inst *inst)
 111    {
 112       for (unsigned i = 0; i < inst->sources; i++) {
 113          if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
 114             if (reg_offset(inst->src[i]) % REG_SIZE !=
 115                 reg_offset(inst->dst) % REG_SIZE)
 116                return 0;
 117       }
 118
 119       return reg_offset(inst->dst) % REG_SIZE;
 120    }
 121
 122    /*
 123     * Return whether the instruction has an unsupported channel bit layout
 124     * specified for the i-th source region.
 125     */
 126    bool
 127    has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
 128                           unsigned i)
 129    {
 130       if (is_unordered(inst) || inst->is_control_source(i))
 131          return false;
 132
 133       /* Empirical testing shows that Broadwell has a bug affecting half-float
 134        * MAD instructions when any of its sources has a non-zero offset, such
 135        * as:
 136        *
 137        * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
 138        *
 139        * We used to generate code like this for SIMD8 executions where we
 140        * used to pack components Y and W of a vector at offset 16B of a SIMD
 141        * register. The problem doesn't occur if the stride of the source is 0.
 142        */
 143       if (devinfo->gen == 8 &&
 144           inst->opcode == BRW_OPCODE_MAD &&
 145           inst->src[i].type == BRW_REGISTER_TYPE_HF &&
 146           reg_offset(inst->src[i]) % REG_SIZE > 0 &&
 147           inst->src[i].stride != 0) {
 148          return true;
 149       }
 150
 151       const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
 152       const unsigned src_byte_stride = inst->src[i].stride *
 153          type_sz(inst->src[i].type);
 154       const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
 155       const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
 156
 157       return has_dst_aligned_region_restriction(devinfo, inst) &&
 158              !is_uniform(inst->src[i]) &&
 159              (src_byte_stride != dst_byte_stride ||
 160               src_byte_offset != dst_byte_offset);
 161    }
 162
 163    /*
 164     * Return whether the instruction has an unsupported channel bit layout
 165     * specified for the destination region.
 166     */
 167    bool
 168    has_invalid_dst_region(const gen_device_info *devinfo,
 169                           const fs_inst *inst)
 170    {
 171       if (is_unordered(inst)) {
 172          return false;
 173       } else {
 174          const brw_reg_type exec_type = get_exec_type(inst);
 175          const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
 176          const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
 177          const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
 178             type_sz(inst->dst.type) < type_sz(exec_type);
 179
 180          return (has_dst_aligned_region_restriction(devinfo, inst) &&
 181                  (required_dst_byte_stride(inst) != dst_byte_stride ||
 182                   required_dst_byte_offset(inst) != dst_byte_offset)) ||
 183                 (is_narrowing_conversion &&
 184                  required_dst_byte_stride(inst) != dst_byte_stride);
 185       }
 186    }
 187
 188    /*
 189     * Return whether the instruction has unsupported source modifiers
 190     * specified for the i-th source region.
 191     */
 192    bool
 193    has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
 194                              unsigned i)
 195    {
 196       return !inst->can_do_source_mods(devinfo) &&
 197              (inst->src[i].negate || inst->src[i].abs);
 198    }
 199
 200    /*
 201     * Return whether the instruction has an unsupported type conversion
 202     * specified for the destination.
 203     */
 204    bool
 205    has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
 206    {
 207       switch (inst->opcode) {
 208       case BRW_OPCODE_MOV:
 209          return false;
 210       case BRW_OPCODE_SEL:
 211          return inst->dst.type != get_exec_type(inst);
 212       case SHADER_OPCODE_BROADCAST:
 213       case SHADER_OPCODE_MOV_INDIRECT:
 214          /* The source and destination types of these may be hard-coded to
 215           * integer at codegen time due to hardware limitations of 64-bit
 216           * types.
 217           */
 218          return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
 219                  devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
 220                 type_sz(inst->src[0].type) > 4 &&
 221                 inst->dst.type != inst->src[0].type;
 222       default:
 223          /* FIXME: We assume the opcodes don't explicitly mentioned before
 224           * just work fine with arbitrary conversions.
 225           */
 226          return false;
 227       }
 228    }
 229
 230    /**
 231     * Return whether the instruction has non-standard semantics for the
 232     * conditional mod which don't cause the flag register to be updated with
 233     * the comparison result.
 234     */
 235    bool
 236    has_inconsistent_cmod(const fs_inst *inst)
 237    {
 238       return inst->opcode == BRW_OPCODE_SEL ||
 239              inst->opcode == BRW_OPCODE_CSEL ||
 240              inst->opcode == BRW_OPCODE_IF ||
 241              inst->opcode == BRW_OPCODE_WHILE;
 242    }
 243
 244    bool
 245    lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
 246 }
 247
 248 namespace brw {
 249    /**
 250     * Remove any modifiers from the \p i-th source region of the instruction,
 251     * including negate, abs and any implicit type conversion to the execution
 252     * type.  Instead any source modifiers will be implemented as a separate
 253     * MOV instruction prior to the original instruction.
 254     */
 255    bool
 256    lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
 257    {
 258       assert(inst->components_read(i) == 1);
 259       const fs_builder ibld(v, block, inst);
 260       const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
 261
 262       lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
 263       inst->src[i] = tmp;
 264
 265       return true;
 266    }
 267 }
 268
 269 namespace {
 270    /**
 271     * Remove any modifiers from the destination region of the instruction,
 272     * including saturate, conditional mod and any implicit type conversion
 273     * from the execution type.  Instead any destination modifiers will be
 274     * implemented as a separate MOV instruction after the original
 275     * instruction.
 276     */
 277    bool
 278    lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
 279    {
 280       const fs_builder ibld(v, block, inst);
 281       const brw_reg_type type = get_exec_type(inst);
 282       /* Not strictly necessary, but if possible use a temporary with the same
 283        * channel alignment as the current destination in order to avoid
 284        * violating the restrictions enforced later on by lower_src_region()
 285        * and lower_dst_region(), which would introduce additional copy
 286        * instructions into the program unnecessarily.
 287        */
 288       const unsigned stride =
 289          type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
 290          type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
 291       fs_reg tmp = ibld.vgrf(type, stride);
 292       ibld.UNDEF(tmp);
 293       tmp = horiz_stride(tmp, stride);
 294
 295       /* Emit a MOV taking care of all the destination modifiers. */
 296       fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
 297       mov->saturate = inst->saturate;
 298       if (!has_inconsistent_cmod(inst))
 299          mov->conditional_mod = inst->conditional_mod;
 300       if (inst->opcode != BRW_OPCODE_SEL) {
 301          mov->predicate = inst->predicate;
 302          mov->predicate_inverse = inst->predicate_inverse;
 303       }
 304       mov->flag_subreg = inst->flag_subreg;
 305       lower_instruction(v, block, mov);
 306
 307       /* Point the original instruction at the temporary, and clean up any
 308        * destination modifiers.
 309        */
 310       assert(inst->size_written == inst->dst.component_size(inst->exec_size));
 311       inst->dst = tmp;
 312       inst->size_written = inst->dst.component_size(inst->exec_size);
 313       inst->saturate = false;
 314       if (!has_inconsistent_cmod(inst))
 315          inst->conditional_mod = BRW_CONDITIONAL_NONE;
 316
 317       assert(!inst->flags_written() || !mov->predicate);
 318       return true;
 319    }
 320
 321    /**
 322     * Remove any non-trivial shuffling of data from the \p i-th source region
 323     * of the instruction.  Instead implement the region as a series of integer
 324     * copies into a temporary with the same channel layout as the destination.
 325     */
 326    bool
 327    lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
 328    {
 329       assert(inst->components_read(i) == 1);
 330       const fs_builder ibld(v, block, inst);
 331       const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
 332                               type_sz(inst->src[i].type);
 333       assert(stride > 0);
 334       fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
 335       ibld.UNDEF(tmp);
 336       tmp = horiz_stride(tmp, stride);
 337
 338       /* Emit a series of 32-bit integer copies with any source modifiers
 339        * cleaned up (because their semantics are dependent on the type).
 340        */
 341       const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
 342                                                  false);
 343       const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
 344       fs_reg raw_src = inst->src[i];
 345       raw_src.negate = false;
 346       raw_src.abs = false;
 347
 348       for (unsigned j = 0; j < n; j++)
 349          ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
 350
 351       /* Point the original instruction at the temporary, making sure to keep
 352        * any source modifiers in the instruction.
 353        */
 354       fs_reg lower_src = tmp;
 355       lower_src.negate = inst->src[i].negate;
 356       lower_src.abs = inst->src[i].abs;
 357       inst->src[i] = lower_src;
 358
 359       return true;
 360    }
 361
 362    /**
 363     * Remove any non-trivial shuffling of data from the destination region of
 364     * the instruction.  Instead implement the region as a series of integer
 365     * copies from a temporary with a channel layout compatible with the
 366     * sources.
 367     */
 368    bool
 369    lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
 370    {
 371       /* We cannot replace the result of an integer multiply which writes the
 372        * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
 373        * value whereas the MOV will act on only 32 or 33 bits of the
 374        * accumulator.
 375        */
 376       assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
 377              brw_reg_type_is_floating_point(inst->dst.type));
 378
 379       const fs_builder ibld(v, block, inst);
 380       const unsigned stride = required_dst_byte_stride(inst) /
 381                               type_sz(inst->dst.type);
 382       assert(stride > 0);
 383       fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
 384       ibld.UNDEF(tmp);
 385       tmp = horiz_stride(tmp, stride);
 386
 387       /* Emit a series of 32-bit integer copies from the temporary into the
 388        * original destination.
 389        */
 390       const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
 391                                                  false);
 392       const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
 393
 394       if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
 395          /* Note that in general we cannot simply predicate the copies on the
 396           * same flag register as the original instruction, since it may have
 397           * been overwritten by the instruction itself.  Instead initialize
 398           * the temporary with the previous contents of the destination
 399           * register.
 400           */
 401          for (unsigned j = 0; j < n; j++)
 402             ibld.MOV(subscript(tmp, raw_type, j),
 403                      subscript(inst->dst, raw_type, j));
 404       }
 405
 406       for (unsigned j = 0; j < n; j++)
 407          ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
 408                                         subscript(tmp, raw_type, j));
 409
 410       /* Point the original instruction at the temporary, making sure to keep
 411        * any destination modifiers in the instruction.
 412        */
 413       assert(inst->size_written == inst->dst.component_size(inst->exec_size));
 414       inst->dst = tmp;
 415       inst->size_written = inst->dst.component_size(inst->exec_size);
 416
 417       return true;
 418    }
 419
 420    /**
 421     * Legalize the source and destination regioning controls of the specified
 422     * instruction.
 423     */
 424    bool
 425    lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
 426    {
 427       const gen_device_info *devinfo = v->devinfo;
 428       bool progress = false;
 429
 430       if (has_invalid_conversion(devinfo, inst))
 431          progress |= lower_dst_modifiers(v, block, inst);
 432
 433       if (has_invalid_dst_region(devinfo, inst))
 434          progress |= lower_dst_region(v, block, inst);
 435
 436       for (unsigned i = 0; i < inst->sources; i++) {
 437          if (has_invalid_src_modifiers(devinfo, inst, i))
 438             progress |= lower_src_modifiers(v, block, inst, i);
 439
 440          if (has_invalid_src_region(devinfo, inst, i))
 441             progress |= lower_src_region(v, block, inst, i);
 442       }
 443
 444       return progress;
 445    }
 446 }
 447
 448 bool
 449 fs_visitor::lower_regioning()
 450 {
 451    bool progress = false;
 452
 453    foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
 454       progress |= lower_instruction(this, block, inst);
 455
 456    if (progress)
 457       invalidate_live_intervals();
 458
 459    return progress;
 460 }