src/intel/compiler/brw_fs_lower_regioning.cpp

   1 /*
   2  * Copyright © 2018 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_fs.h"
  25 #include "brw_cfg.h"
  26 #include "brw_fs_builder.h"
  27
  28 using namespace brw;
  29
  30 namespace {
  31    /* From the SKL PRM Vol 2a, "Move":
  32     *
  33     * "A mov with the same source and destination type, no source modifier,
  34     *  and no saturation is a raw move. A packed byte destination region (B
  35     *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
  36     *  using raw move."
  37     */
  38    bool
  39    is_byte_raw_mov(const fs_inst *inst)
  40    {
  41       return type_sz(inst->dst.type) == 1 &&
  42              inst->opcode == BRW_OPCODE_MOV &&
  43              inst->src[0].type == inst->dst.type &&
  44              !inst->saturate &&
  45              !inst->src[0].negate &&
  46              !inst->src[0].abs;
  47    }
  48
  49    /*
  50     * Return an acceptable byte stride for the destination of an instruction
  51     * that requires it to have some particular alignment.
  52     */
  53    unsigned
  54    required_dst_byte_stride(const fs_inst *inst)
  55    {
  56       if (inst->dst.is_accumulator()) {
  57          /* If the destination is an accumulator, insist that we leave the
  58           * stride alone.  We cannot "fix" accumulator destinations by writing
  59           * to a temporary and emitting a MOV into the original destination.
  60           * For multiply instructions (our one use of the accumulator), the
  61           * MUL writes the full 66 bits of the accumulator whereas the MOV we
  62           * would emit only writes 33 bits and leaves the top 33 bits
  63           * undefined.
  64           *
  65           * It's safe to just require the original stride here because the
  66           * lowering pass will detect the mismatch in has_invalid_src_region
  67           * and fix the sources of the multiply instead of the destination.
  68           */
  69          return inst->dst.stride * type_sz(inst->dst.type);
  70       } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
  71           !is_byte_raw_mov(inst)) {
  72          return get_exec_type_size(inst);
  73       } else {
  74          /* Calculate the maximum byte stride and the minimum/maximum type
  75           * size across all source and destination operands we are required to
  76           * lower.
  77           */
  78          unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
  79          unsigned min_size = type_sz(inst->dst.type);
  80          unsigned max_size = type_sz(inst->dst.type);
  81
  82          for (unsigned i = 0; i < inst->sources; i++) {
  83             if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
  84                const unsigned size = type_sz(inst->src[i].type);
  85                max_stride = MAX2(max_stride, inst->src[i].stride * size);
  86                min_size = MIN2(min_size, size);
  87                max_size = MAX2(max_size, size);
  88             }
  89          }
  90
  91          /* All operands involved in lowering need to fit in the calculated
  92           * stride.
  93           */
  94          assert(max_size <= 4 * min_size);
  95
  96          /* Attempt to use the largest byte stride among all present operands,
  97           * but never exceed a stride of 4 since that would lead to illegal
  98           * destination regions during lowering.
  99           */
 100          return MIN2(max_stride, 4 * min_size);
 101       }
 102    }
 103
 104    /*
 105     * Return an acceptable byte sub-register offset for the destination of an
 106     * instruction that requires it to be aligned to the sub-register offset of
 107     * the sources.
 108     */
 109    unsigned
 110    required_dst_byte_offset(const fs_inst *inst)
 111    {
 112       for (unsigned i = 0; i < inst->sources; i++) {
 113          if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
 114             if (reg_offset(inst->src[i]) % REG_SIZE !=
 115                 reg_offset(inst->dst) % REG_SIZE)
 116                return 0;
 117       }
 118
 119       return reg_offset(inst->dst) % REG_SIZE;
 120    }
 121
 122    /*
 123     * Return whether the instruction has an unsupported channel bit layout
 124     * specified for the i-th source region.
 125     */
 126    bool
 127    has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
 128                           unsigned i)
 129    {
 130       if (is_unordered(inst) || inst->is_control_source(i))
 131          return false;
 132
 133       /* Empirical testing shows that Broadwell has a bug affecting half-float
 134        * MAD instructions when any of its sources has a non-zero offset, such
 135        * as:
 136        *
 137        * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
 138        *
 139        * We used to generate code like this for SIMD8 executions where we
 140        * used to pack components Y and W of a vector at offset 16B of a SIMD
 141        * register. The problem doesn't occur if the stride of the source is 0.
 142        */
 143       if (devinfo->gen == 8 &&
 144           inst->opcode == BRW_OPCODE_MAD &&
 145           inst->src[i].type == BRW_REGISTER_TYPE_HF &&
 146           reg_offset(inst->src[i]) % REG_SIZE > 0 &&
 147           inst->src[i].stride != 0) {
 148          return true;
 149       }
 150
 151       const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
 152       const unsigned src_byte_stride = inst->src[i].stride *
 153          type_sz(inst->src[i].type);
 154       const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
 155       const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
 156
 157       return has_dst_aligned_region_restriction(devinfo, inst) &&
 158              !is_uniform(inst->src[i]) &&
 159              (src_byte_stride != dst_byte_stride ||
 160               src_byte_offset != dst_byte_offset);
 161    }
 162
 163    /*
 164     * Return whether the instruction has an unsupported channel bit layout
 165     * specified for the destination region.
 166     */
 167    bool
 168    has_invalid_dst_region(const gen_device_info *devinfo,
 169                           const fs_inst *inst)
 170    {
 171       if (is_unordered(inst)) {
 172          return false;
 173       } else {
 174          const brw_reg_type exec_type = get_exec_type(inst);
 175          const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
 176          const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
 177          const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
 178             type_sz(inst->dst.type) < type_sz(exec_type);
 179
 180          return (has_dst_aligned_region_restriction(devinfo, inst) &&
 181                  (required_dst_byte_stride(inst) != dst_byte_stride ||
 182                   required_dst_byte_offset(inst) != dst_byte_offset)) ||
 183                 (is_narrowing_conversion &&
 184                  required_dst_byte_stride(inst) != dst_byte_stride);
 185       }
 186    }
 187
 188    /*
 189     * Return whether the instruction has unsupported source modifiers
 190     * specified for the i-th source region.
 191     */
 192    bool
 193    has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
 194                              unsigned i)
 195    {
 196       return !inst->can_do_source_mods(devinfo) &&
 197              (inst->src[i].negate || inst->src[i].abs);
 198    }
 199
 200    /*
 201     * Return whether the instruction has an unsupported type conversion
 202     * specified for the destination.
 203     */
 204    bool
 205    has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
 206    {
 207       switch (inst->opcode) {
 208       case BRW_OPCODE_MOV:
 209          return false;
 210       case BRW_OPCODE_SEL:
 211          return inst->dst.type != get_exec_type(inst);
 212       case SHADER_OPCODE_BROADCAST:
 213       case SHADER_OPCODE_MOV_INDIRECT:
 214          /* The source and destination types of these may be hard-coded to
 215           * integer at codegen time due to hardware limitations of 64-bit
 216           * types.
 217           */
 218          return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
 219                  devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
 220                 type_sz(inst->src[0].type) > 4 &&
 221                 inst->dst.type != inst->src[0].type;
 222       default:
 223          /* FIXME: We assume the opcodes don't explicitly mentioned before
 224           * just work fine with arbitrary conversions.
 225           */
 226          return false;
 227       }
 228    }
 229
 230    /**
 231     * Return whether the instruction has non-standard semantics for the
 232     * conditional mod which don't cause the flag register to be updated with
 233     * the comparison result.
 234     */
 235    bool
 236    has_inconsistent_cmod(const fs_inst *inst)
 237    {
 238       return inst->opcode == BRW_OPCODE_SEL ||
 239              inst->opcode == BRW_OPCODE_CSEL ||
 240              inst->opcode == BRW_OPCODE_IF ||
 241              inst->opcode == BRW_OPCODE_WHILE;
 242    }
 243
 244    bool
 245    lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
 246 }
 247
 248 namespace brw {
 249    /**
 250     * Remove any modifiers from the \p i-th source region of the instruction,
 251     * including negate, abs and any implicit type conversion to the execution
 252     * type.  Instead any source modifiers will be implemented as a separate
 253     * MOV instruction prior to the original instruction.
 254     */
 255    bool
 256    lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
 257    {
 258       assert(inst->components_read(i) == 1);
 259       const fs_builder ibld(v, block, inst);
 260       const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
 261
 262       lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
 263       inst->src[i] = tmp;
 264
 265       return true;
 266    }
 267 }
 268
 269 namespace {
 270    /**
 271     * Remove any modifiers from the destination region of the instruction,
 272     * including saturate, conditional mod and any implicit type conversion
 273     * from the execution type.  Instead any destination modifiers will be
 274     * implemented as a separate MOV instruction after the original
 275     * instruction.
 276     */
 277    bool
 278    lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
 279    {
 280       const fs_builder ibld(v, block, inst);
 281       const brw_reg_type type = get_exec_type(inst);
 282       /* Not strictly necessary, but if possible use a temporary with the same
 283        * channel alignment as the current destination in order to avoid
 284        * violating the restrictions enforced later on by lower_src_region()
 285        * and lower_dst_region(), which would introduce additional copy
 286        * instructions into the program unnecessarily.
 287        */
 288       const unsigned stride =
 289          type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
 290          type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
 291       const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride);
 292
 293       /* Emit a MOV taking care of all the destination modifiers. */
 294       fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
 295       mov->saturate = inst->saturate;
 296       if (!has_inconsistent_cmod(inst))
 297          mov->conditional_mod = inst->conditional_mod;
 298       if (inst->opcode != BRW_OPCODE_SEL) {
 299          mov->predicate = inst->predicate;
 300          mov->predicate_inverse = inst->predicate_inverse;
 301       }
 302       mov->flag_subreg = inst->flag_subreg;
 303       lower_instruction(v, block, mov);
 304
 305       /* Point the original instruction at the temporary, and clean up any
 306        * destination modifiers.
 307        */
 308       assert(inst->size_written == inst->dst.component_size(inst->exec_size));
 309       inst->dst = tmp;
 310       inst->size_written = inst->dst.component_size(inst->exec_size);
 311       inst->saturate = false;
 312       if (!has_inconsistent_cmod(inst))
 313          inst->conditional_mod = BRW_CONDITIONAL_NONE;
 314
 315       assert(!inst->flags_written() || !mov->predicate);
 316       return true;
 317    }
 318
 319    /**
 320     * Remove any non-trivial shuffling of data from the \p i-th source region
 321     * of the instruction.  Instead implement the region as a series of integer
 322     * copies into a temporary with the same channel layout as the destination.
 323     */
 324    bool
 325    lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
 326    {
 327       assert(inst->components_read(i) == 1);
 328       const fs_builder ibld(v, block, inst);
 329       const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
 330                               type_sz(inst->src[i].type);
 331       assert(stride > 0);
 332       const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride),
 333                                       stride);
 334
 335       /* Emit a series of 32-bit integer copies with any source modifiers
 336        * cleaned up (because their semantics are dependent on the type).
 337        */
 338       const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
 339                                                  false);
 340       const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
 341       fs_reg raw_src = inst->src[i];
 342       raw_src.negate = false;
 343       raw_src.abs = false;
 344
 345       for (unsigned j = 0; j < n; j++)
 346          ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
 347
 348       /* Point the original instruction at the temporary, making sure to keep
 349        * any source modifiers in the instruction.
 350        */
 351       fs_reg lower_src = tmp;
 352       lower_src.negate = inst->src[i].negate;
 353       lower_src.abs = inst->src[i].abs;
 354       inst->src[i] = lower_src;
 355
 356       return true;
 357    }
 358
 359    /**
 360     * Remove any non-trivial shuffling of data from the destination region of
 361     * the instruction.  Instead implement the region as a series of integer
 362     * copies from a temporary with a channel layout compatible with the
 363     * sources.
 364     */
 365    bool
 366    lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
 367    {
 368       /* We cannot replace the result of an integer multiply which writes the
 369        * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
 370        * value whereas the MOV will act on only 32 or 33 bits of the
 371        * accumulator.
 372        */
 373       assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
 374              brw_reg_type_is_floating_point(inst->dst.type));
 375
 376       const fs_builder ibld(v, block, inst);
 377       const unsigned stride = required_dst_byte_stride(inst) /
 378                               type_sz(inst->dst.type);
 379       assert(stride > 0);
 380       const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride),
 381                                       stride);
 382
 383       /* Emit a series of 32-bit integer copies from the temporary into the
 384        * original destination.
 385        */
 386       const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
 387                                                  false);
 388       const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
 389
 390       if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
 391          /* Note that in general we cannot simply predicate the copies on the
 392           * same flag register as the original instruction, since it may have
 393           * been overwritten by the instruction itself.  Instead initialize
 394           * the temporary with the previous contents of the destination
 395           * register.
 396           */
 397          for (unsigned j = 0; j < n; j++)
 398             ibld.MOV(subscript(tmp, raw_type, j),
 399                      subscript(inst->dst, raw_type, j));
 400       }
 401
 402       for (unsigned j = 0; j < n; j++)
 403          ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
 404                                         subscript(tmp, raw_type, j));
 405
 406       /* Point the original instruction at the temporary, making sure to keep
 407        * any destination modifiers in the instruction.
 408        */
 409       assert(inst->size_written == inst->dst.component_size(inst->exec_size));
 410       inst->dst = tmp;
 411       inst->size_written = inst->dst.component_size(inst->exec_size);
 412
 413       return true;
 414    }
 415
 416    /**
 417     * Legalize the source and destination regioning controls of the specified
 418     * instruction.
 419     */
 420    bool
 421    lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
 422    {
 423       const gen_device_info *devinfo = v->devinfo;
 424       bool progress = false;
 425
 426       if (has_invalid_conversion(devinfo, inst))
 427          progress |= lower_dst_modifiers(v, block, inst);
 428
 429       if (has_invalid_dst_region(devinfo, inst))
 430          progress |= lower_dst_region(v, block, inst);
 431
 432       for (unsigned i = 0; i < inst->sources; i++) {
 433          if (has_invalid_src_modifiers(devinfo, inst, i))
 434             progress |= lower_src_modifiers(v, block, inst, i);
 435
 436          if (has_invalid_src_region(devinfo, inst, i))
 437             progress |= lower_src_region(v, block, inst, i);
 438       }
 439
 440       return progress;
 441    }
 442 }
 443
 444 bool
 445 fs_visitor::lower_regioning()
 446 {
 447    bool progress = false;
 448
 449    foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
 450       progress |= lower_instruction(this, block, inst);
 451
 452    if (progress)
 453       invalidate_live_intervals();
 454
 455    return progress;
 456 }