src/intel/compiler/brw_fs_lower_regioning.cpp

   1 /*
   2  * Copyright © 2018 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_fs.h"
  25 #include "brw_cfg.h"
  26 #include "brw_fs_builder.h"
  27
  28 using namespace brw;
  29
  30 namespace {
  31    /* From the SKL PRM Vol 2a, "Move":
  32     *
  33     * "A mov with the same source and destination type, no source modifier,
  34     *  and no saturation is a raw move. A packed byte destination region (B
  35     *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
  36     *  using raw move."
  37     */
  38    bool
  39    is_byte_raw_mov(const fs_inst *inst)
  40    {
  41       return type_sz(inst->dst.type) == 1 &&
  42              inst->opcode == BRW_OPCODE_MOV &&
  43              inst->src[0].type == inst->dst.type &&
  44              !inst->saturate &&
  45              !inst->src[0].negate &&
  46              !inst->src[0].abs;
  47    }
  48
  49    /*
  50     * Return an acceptable byte stride for the destination of an instruction
  51     * that requires it to have some particular alignment.
  52     */
  53    unsigned
  54    required_dst_byte_stride(const fs_inst *inst)
  55    {
  56       if (inst->dst.is_accumulator()) {
  57          /* If the destination is an accumulator, insist that we leave the
  58           * stride alone.  We cannot "fix" accumulator destinations by writing
  59           * to a temporary and emitting a MOV into the original destination.
  60           * For multiply instructions (our one use of the accumulator), the
  61           * MUL writes the full 66 bits of the accumulator whereas the MOV we
  62           * would emit only writes 33 bits and leaves the top 33 bits
  63           * undefined.
  64           *
  65           * It's safe to just require the original stride here because the
  66           * lowering pass will detect the mismatch in has_invalid_src_region
  67           * and fix the sources of the multiply instead of the destination.
  68           */
  69          return inst->dst.stride * type_sz(inst->dst.type);
  70       } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
  71           !is_byte_raw_mov(inst)) {
  72          return get_exec_type_size(inst);
  73       } else {
  74          /* Calculate the maximum byte stride and the minimum/maximum type
  75           * size across all source and destination operands we are required to
  76           * lower.
  77           */
  78          unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
  79          unsigned min_size = type_sz(inst->dst.type);
  80          unsigned max_size = type_sz(inst->dst.type);
  81
  82          for (unsigned i = 0; i < inst->sources; i++) {
  83             if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
  84                const unsigned size = type_sz(inst->src[i].type);
  85                max_stride = MAX2(max_stride, inst->src[i].stride * size);
  86                min_size = MIN2(min_size, size);
  87                max_size = MAX2(max_size, size);
  88             }
  89          }
  90
  91          /* All operands involved in lowering need to fit in the calculated
  92           * stride.
  93           */
  94          assert(max_size <= 4 * min_size);
  95
  96          /* Attempt to use the largest byte stride among all present operands,
  97           * but never exceed a stride of 4 since that would lead to illegal
  98           * destination regions during lowering.
  99           */
 100          return MIN2(max_stride, 4 * min_size);
 101       }
 102    }
 103
 104    /*
 105     * Return an acceptable byte sub-register offset for the destination of an
 106     * instruction that requires it to be aligned to the sub-register offset of
 107     * the sources.
 108     */
 109    unsigned
 110    required_dst_byte_offset(const fs_inst *inst)
 111    {
 112       for (unsigned i = 0; i < inst->sources; i++) {
 113          if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
 114             if (reg_offset(inst->src[i]) % REG_SIZE !=
 115                 reg_offset(inst->dst) % REG_SIZE)
 116                return 0;
 117       }
 118
 119       return reg_offset(inst->dst) % REG_SIZE;
 120    }
 121
 122    /*
 123     * Return whether the instruction has an unsupported channel bit layout
 124     * specified for the i-th source region.
 125     */
 126    bool
 127    has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
 128                           unsigned i)
 129    {
 130       if (is_unordered(inst) || inst->is_control_source(i))
 131          return false;
 132
 133       /* Empirical testing shows that Broadwell has a bug affecting half-float
 134        * MAD instructions when any of its sources has a non-zero offset, such
 135        * as:
 136        *
 137        * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
 138        *
 139        * We used to generate code like this for SIMD8 executions where we
 140        * used to pack components Y and W of a vector at offset 16B of a SIMD
 141        * register. The problem doesn't occur if the stride of the source is 0.
 142        */
 143       if (devinfo->gen == 8 &&
 144           inst->opcode == BRW_OPCODE_MAD &&
 145           inst->src[i].type == BRW_REGISTER_TYPE_HF &&
 146           reg_offset(inst->src[i]) % REG_SIZE > 0 &&
 147           inst->src[i].stride != 0) {
 148          return true;
 149       }
 150
 151       const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
 152       const unsigned src_byte_stride = inst->src[i].stride *
 153          type_sz(inst->src[i].type);
 154       const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
 155       const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
 156
 157       return has_dst_aligned_region_restriction(devinfo, inst) &&
 158              !is_uniform(inst->src[i]) &&
 159              (src_byte_stride != dst_byte_stride ||
 160               src_byte_offset != dst_byte_offset);
 161    }
 162
 163    /*
 164     * Return whether the instruction has an unsupported channel bit layout
 165     * specified for the destination region.
 166     */
 167    bool
 168    has_invalid_dst_region(const gen_device_info *devinfo,
 169                           const fs_inst *inst)
 170    {
 171       if (is_unordered(inst)) {
 172          return false;
 173       } else {
 174          const brw_reg_type exec_type = get_exec_type(inst);
 175          const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
 176          const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
 177          const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
 178             type_sz(inst->dst.type) < type_sz(exec_type);
 179
 180          return (has_dst_aligned_region_restriction(devinfo, inst) &&
 181                  (required_dst_byte_stride(inst) != dst_byte_stride ||
 182                   required_dst_byte_offset(inst) != dst_byte_offset)) ||
 183                 (is_narrowing_conversion &&
 184                  required_dst_byte_stride(inst) != dst_byte_stride);
 185       }
 186    }
 187
 188    /*
 189     * Return whether the instruction has unsupported source modifiers
 190     * specified for the i-th source region.
 191     */
 192    bool
 193    has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
 194                              unsigned i)
 195    {
 196       return !inst->can_do_source_mods(devinfo) &&
 197              (inst->src[i].negate || inst->src[i].abs);
 198    }
 199
 200    /*
 201     * Return whether the instruction has an unsupported type conversion
 202     * specified for the destination.
 203     */
 204    bool
 205    has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
 206    {
 207       switch (inst->opcode) {
 208       case BRW_OPCODE_MOV:
 209          return false;
 210       case BRW_OPCODE_SEL:
 211          return inst->dst.type != get_exec_type(inst);
 212       case SHADER_OPCODE_BROADCAST:
 213       case SHADER_OPCODE_MOV_INDIRECT:
 214          /* The source and destination types of these may be hard-coded to
 215           * integer at codegen time due to hardware limitations of 64-bit
 216           * types.
 217           */
 218          return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
 219                  devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
 220                 type_sz(inst->src[0].type) > 4 &&
 221                 inst->dst.type != inst->src[0].type;
 222       default:
 223          /* FIXME: We assume the opcodes don't explicitly mentioned before
 224           * just work fine with arbitrary conversions.
 225           */
 226          return false;
 227       }
 228    }
 229
 230    /**
 231     * Return whether the instruction has non-standard semantics for the
 232     * conditional mod which don't cause the flag register to be updated with
 233     * the comparison result.
 234     */
 235    bool
 236    has_inconsistent_cmod(const fs_inst *inst)
 237    {
 238       return inst->opcode == BRW_OPCODE_SEL ||
 239              inst->opcode == BRW_OPCODE_CSEL ||
 240              inst->opcode == BRW_OPCODE_IF ||
 241              inst->opcode == BRW_OPCODE_WHILE;
 242    }
 243
   /* Forward declaration: the lowering helpers defined below call back into
    * lower_instruction() on the MOVs they emit, so it has to be visible
    * before its definition further down in this file.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
 246 }
 247
 248 namespace brw {
 249    /**
 250     * Remove any modifiers from the \p i-th source region of the instruction,
 251     * including negate, abs and any implicit type conversion to the execution
 252     * type.  Instead any source modifiers will be implemented as a separate
 253     * MOV instruction prior to the original instruction.
 254     */
 255    bool
 256    lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
 257    {
 258       assert(inst->components_read(i) == 1);
 259       assert(v->devinfo->has_integer_dword_mul ||
 260              inst->opcode != BRW_OPCODE_MUL ||
 261              brw_reg_type_is_floating_point(get_exec_type(inst)) ||
 262              MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
 263              type_sz(inst->src[i].type) == get_exec_type_size(inst));
 264
 265       const fs_builder ibld(v, block, inst);
 266       const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
 267
 268       lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
 269       inst->src[i] = tmp;
 270
 271       return true;
 272    }
 273 }
 274
 275 namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    *
    * \param v      Visitor the builder used for the new instructions is
    *               created from.
    * \param block  Basic block containing \p inst.
    * \param inst   Instruction to rewrite in place; on return it writes a
    *               freshly allocated temporary of its execution type.
    *
    * \return Always true.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      /* The element stride of the temporary is the destination byte stride
       * expressed in units of the execution type, but never less than one.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      fs_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      /* The conditional mod only moves to the MOV for opcodes where it has
       * the standard meaning; otherwise it stays on the original
       * instruction.
       */
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      /* The predicate of a SEL is not propagated to the MOV.
       * NOTE(review): presumably because SEL uses its predicate to choose
       * between its sources rather than to mask the write -- confirm.
       */
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      /* The new MOV may itself need legalization. */
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      /* Sanity check: the original instruction must not still write flags
       * if the MOV emitted after it is predicated.
       */
      assert(!inst->flags_written() || !mov->predicate);
      return true;
   }
 326
 327    /**
 328     * Remove any non-trivial shuffling of data from the \p i-th source region
 329     * of the instruction.  Instead implement the region as a series of integer
 330     * copies into a temporary with the same channel layout as the destination.
 331     */
 332    bool
 333    lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
 334    {
 335       assert(inst->components_read(i) == 1);
 336       const fs_builder ibld(v, block, inst);
 337       const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
 338                               type_sz(inst->src[i].type);
 339       assert(stride > 0);
 340       fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
 341       ibld.UNDEF(tmp);
 342       tmp = horiz_stride(tmp, stride);
 343
 344       /* Emit a series of 32-bit integer copies with any source modifiers
 345        * cleaned up (because their semantics are dependent on the type).
 346        */
 347       const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
 348                                                  false);
 349       const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
 350       fs_reg raw_src = inst->src[i];
 351       raw_src.negate = false;
 352       raw_src.abs = false;
 353
 354       for (unsigned j = 0; j < n; j++)
 355          ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
 356
 357       /* Point the original instruction at the temporary, making sure to keep
 358        * any source modifiers in the instruction.
 359        */
 360       fs_reg lower_src = tmp;
 361       lower_src.negate = inst->src[i].negate;
 362       lower_src.abs = inst->src[i].abs;
 363       inst->src[i] = lower_src;
 364
 365       return true;
 366    }
 367
   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    *
    * \param v      Visitor the builder used for the new instructions is
    *               created from.
    * \param block  Basic block containing \p inst.
    * \param inst   Instruction to rewrite in place; on return it writes a
    *               freshly allocated temporary whose stride is derived from
    *               required_dst_byte_stride().
    *
    * \return Always true.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      /* Convert the required byte stride into an element stride of the
       * destination type.
       */
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      /* raw_type is an unsigned integer type of at most 4 bytes; n is the
       * number of such pieces making up one element of the temporary.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      /* These copies are inserted at inst->next, i.e. following the original
       * instruction, and write the result back to the real destination.
       */
      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }
 425
 426    /**
 427     * Legalize the source and destination regioning controls of the specified
 428     * instruction.
 429     */
 430    bool
 431    lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
 432    {
 433       const gen_device_info *devinfo = v->devinfo;
 434       bool progress = false;
 435
 436       if (has_invalid_conversion(devinfo, inst))
 437          progress |= lower_dst_modifiers(v, block, inst);
 438
 439       if (has_invalid_dst_region(devinfo, inst))
 440          progress |= lower_dst_region(v, block, inst);
 441
 442       for (unsigned i = 0; i < inst->sources; i++) {
 443          if (has_invalid_src_modifiers(devinfo, inst, i))
 444             progress |= lower_src_modifiers(v, block, inst, i);
 445
 446          if (has_invalid_src_region(devinfo, inst, i))
 447             progress |= lower_src_region(v, block, inst, i);
 448       }
 449
 450       return progress;
 451    }
 452 }
 453
 454 bool
 455 fs_visitor::lower_regioning()
 456 {
 457    bool progress = false;
 458
 459    foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
 460       progress |= lower_instruction(this, block, inst);
 461
 462    if (progress)
 463       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
 464
 465    return progress;
 466 }