src/intel/compiler/brw_vec4_copy_propagation.cpp

   1 /*
   2  * Copyright © 2011 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /**
  25  * @file brw_vec4_copy_propagation.cpp
  26  *
  27  * Implements tracking of values copied between registers, and
  28  * optimizations based on that: copy propagation and constant
  29  * propagation.
  30  */
  31
  32 #include "brw_vec4.h"
  33 #include "brw_cfg.h"
  34 #include "brw_eu.h"
  35
  36 namespace brw {
  37
  38 struct copy_entry {
  39    src_reg *value[4];
  40    int saturatemask;
  41 };
  42
  43 static bool
  44 is_direct_copy(vec4_instruction *inst)
  45 {
  46    return (inst->opcode == BRW_OPCODE_MOV &&
  47            !inst->predicate &&
  48            inst->dst.file == VGRF &&
  49            inst->dst.offset % REG_SIZE == 0 &&
  50            !inst->dst.reladdr &&
  51            !inst->src[0].reladdr &&
  52            (inst->dst.type == inst->src[0].type ||
  53             (inst->dst.type == BRW_REGISTER_TYPE_F &&
  54              inst->src[0].type == BRW_REGISTER_TYPE_VF)));
  55 }
  56
  57 static bool
  58 is_dominated_by_previous_instruction(vec4_instruction *inst)
  59 {
  60    return (inst->opcode != BRW_OPCODE_DO &&
  61            inst->opcode != BRW_OPCODE_WHILE &&
  62            inst->opcode != BRW_OPCODE_ELSE &&
  63            inst->opcode != BRW_OPCODE_ENDIF);
  64 }
  65
  66 static bool
  67 is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
  68 {
  69    const src_reg *src = values[ch];
  70
  71    /* consider GRF only */
  72    assert(inst->dst.file == VGRF);
  73    if (!src || src->file != VGRF)
  74       return false;
  75
  76    return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) &&
  77           (inst->dst.offset != src->offset ||
  78            inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch)));
  79 }
  80
  81 static bool
  82 is_logic_op(enum opcode opcode)
  83 {
  84    return (opcode == BRW_OPCODE_AND ||
  85            opcode == BRW_OPCODE_OR  ||
  86            opcode == BRW_OPCODE_XOR ||
  87            opcode == BRW_OPCODE_NOT);
  88 }
  89
  90 /**
  91  * Get the origin of a copy as a single register if all components present in
  92  * the given readmask originate from the same register and have compatible
  93  * regions, otherwise return a BAD_FILE register.
  94  */
  95 static src_reg
  96 get_copy_value(const copy_entry &entry, unsigned readmask)
  97 {
  98    unsigned swz[4] = {};
  99    src_reg value;
 100
 101    for (unsigned i = 0; i < 4; i++) {
 102       if (readmask & (1 << i)) {
 103          if (entry.value[i]) {
 104             src_reg src = *entry.value[i];
 105
 106             if (src.file == IMM) {
 107                swz[i] = i;
 108             } else {
 109                swz[i] = BRW_GET_SWZ(src.swizzle, i);
 110                /* Overwrite the original swizzle so the src_reg::equals call
 111                 * below doesn't care about it, the correct swizzle will be
 112                 * calculated once the swizzles of all components are known.
 113                 */
 114                src.swizzle = BRW_SWIZZLE_XYZW;
 115             }
 116
 117             if (value.file == BAD_FILE) {
 118                value = src;
 119             } else if (!value.equals(src)) {
 120                return src_reg();
 121             }
 122          } else {
 123             return src_reg();
 124          }
 125       }
 126    }
 127
 128    return swizzle(value,
 129                   brw_compose_swizzle(brw_swizzle_for_mask(readmask),
 130                                       BRW_SWIZZLE4(swz[0], swz[1],
 131                                                    swz[2], swz[3])));
 132 }
 133
 134 static bool
 135 try_constant_propagate(const struct gen_device_info *devinfo,
 136                        vec4_instruction *inst,
 137                        int arg, const copy_entry *entry)
 138 {
 139    /* For constant propagation, we only handle the same constant
 140     * across all 4 channels.  Some day, we should handle the 8-bit
 141     * float vector format, which would let us constant propagate
 142     * vectors better.
 143     * We could be more aggressive here -- some channels might not get used
 144     * based on the destination writemask.
 145     */
 146    src_reg value =
 147       get_copy_value(*entry,
 148                      brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
 149                                                    WRITEMASK_XYZW));
 150
 151    if (value.file != IMM)
 152       return false;
 153
 154    /* 64-bit types can't be used except for one-source instructions, which
 155     * higher levels should have constant folded away, so there's no point in
 156     * propagating immediates here.
 157     */
 158    if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8)
 159       return false;
 160
 161    if (value.type == BRW_REGISTER_TYPE_VF) {
 162       /* The result of bit-casting the component values of a vector float
 163        * cannot in general be represented as an immediate.
 164        */
 165       if (inst->src[arg].type != BRW_REGISTER_TYPE_F)
 166          return false;
 167    } else {
 168       value.type = inst->src[arg].type;
 169    }
 170
 171    if (inst->src[arg].abs) {
 172       if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
 173           !brw_abs_immediate(value.type, &value.as_brw_reg())) {
 174          return false;
 175       }
 176    }
 177
 178    if (inst->src[arg].negate) {
 179       if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
 180           !brw_negate_immediate(value.type, &value.as_brw_reg())) {
 181          return false;
 182       }
 183    }
 184
 185    value = swizzle(value, inst->src[arg].swizzle);
 186
 187    switch (inst->opcode) {
 188    case BRW_OPCODE_MOV:
 189    case SHADER_OPCODE_BROADCAST:
 190       inst->src[arg] = value;
 191       return true;
 192
 193    case SHADER_OPCODE_POW:
 194    case SHADER_OPCODE_INT_QUOTIENT:
 195    case SHADER_OPCODE_INT_REMAINDER:
 196       if (devinfo->gen < 8)
 197          break;
 198       /* fallthrough */
 199    case BRW_OPCODE_DP2:
 200    case BRW_OPCODE_DP3:
 201    case BRW_OPCODE_DP4:
 202    case BRW_OPCODE_DPH:
 203    case BRW_OPCODE_BFI1:
 204    case BRW_OPCODE_ASR:
 205    case BRW_OPCODE_SHL:
 206    case BRW_OPCODE_SHR:
 207    case BRW_OPCODE_SUBB:
 208       if (arg == 1) {
 209          inst->src[arg] = value;
 210          return true;
 211       }
 212       break;
 213
 214    case BRW_OPCODE_MACH:
 215    case BRW_OPCODE_MUL:
 216    case SHADER_OPCODE_MULH:
 217    case BRW_OPCODE_ADD:
 218    case BRW_OPCODE_OR:
 219    case BRW_OPCODE_AND:
 220    case BRW_OPCODE_XOR:
 221    case BRW_OPCODE_ADDC:
 222       if (arg == 1) {
 223          inst->src[arg] = value;
 224          return true;
 225       } else if (arg == 0 && inst->src[1].file != IMM) {
 226          /* Fit this constant in by commuting the operands.  Exception: we
 227           * can't do this for 32-bit integer MUL/MACH because it's asymmetric.
 228           */
 229          if ((inst->opcode == BRW_OPCODE_MUL ||
 230               inst->opcode == BRW_OPCODE_MACH) &&
 231              (inst->src[1].type == BRW_REGISTER_TYPE_D ||
 232               inst->src[1].type == BRW_REGISTER_TYPE_UD))
 233             break;
 234          inst->src[0] = inst->src[1];
 235          inst->src[1] = value;
 236          return true;
 237       }
 238       break;
 239    case GS_OPCODE_SET_WRITE_OFFSET:
 240       /* This is just a multiply by a constant with special strides.
 241        * The generator will handle immediates in both arguments (generating
 242        * a single MOV of the product).  So feel free to propagate in src0.
 243        */
 244       inst->src[arg] = value;
 245       return true;
 246
 247    case BRW_OPCODE_CMP:
 248       if (arg == 1) {
 249          inst->src[arg] = value;
 250          return true;
 251       } else if (arg == 0 && inst->src[1].file != IMM) {
 252          enum brw_conditional_mod new_cmod;
 253
 254          new_cmod = brw_swap_cmod(inst->conditional_mod);
 255          if (new_cmod != BRW_CONDITIONAL_NONE) {
 256             /* Fit this constant in by swapping the operands and
 257              * flipping the test.
 258              */
 259             inst->src[0] = inst->src[1];
 260             inst->src[1] = value;
 261             inst->conditional_mod = new_cmod;
 262             return true;
 263          }
 264       }
 265       break;
 266
 267    case BRW_OPCODE_SEL:
 268       if (arg == 1) {
 269          inst->src[arg] = value;
 270          return true;
 271       } else if (arg == 0 && inst->src[1].file != IMM) {
 272          inst->src[0] = inst->src[1];
 273          inst->src[1] = value;
 274
 275          /* If this was predicated, flipping operands means
 276           * we also need to flip the predicate.
 277           */
 278          if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
 279             inst->predicate_inverse = !inst->predicate_inverse;
 280          }
 281          return true;
 282       }
 283       break;
 284
 285    default:
 286       break;
 287    }
 288
 289    return false;
 290 }
 291
 292 static bool
 293 is_align1_opcode(unsigned opcode)
 294 {
 295    switch (opcode) {
 296    case VEC4_OPCODE_DOUBLE_TO_F32:
 297    case VEC4_OPCODE_DOUBLE_TO_D32:
 298    case VEC4_OPCODE_DOUBLE_TO_U32:
 299    case VEC4_OPCODE_TO_DOUBLE:
 300    case VEC4_OPCODE_PICK_LOW_32BIT:
 301    case VEC4_OPCODE_PICK_HIGH_32BIT:
 302    case VEC4_OPCODE_SET_LOW_32BIT:
 303    case VEC4_OPCODE_SET_HIGH_32BIT:
 304       return true;
 305    default:
 306       return false;
 307    }
 308 }
 309
 310 static bool
 311 try_copy_propagate(const struct gen_device_info *devinfo,
 312                    vec4_instruction *inst, int arg,
 313                    const copy_entry *entry, int attributes_per_reg)
 314 {
 315    /* Build up the value we are propagating as if it were the source of a
 316     * single MOV
 317     */
 318    src_reg value =
 319       get_copy_value(*entry,
 320                      brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
 321                                                    WRITEMASK_XYZW));
 322
 323    /* Check that we can propagate that value */
 324    if (value.file != UNIFORM &&
 325        value.file != VGRF &&
 326        value.file != ATTR)
 327       return false;
 328
 329    /* In gen < 8 instructions that write 2 registers also need to read 2
 330     * registers. Make sure we don't break that restriction by copy
 331     * propagating from a uniform.
 332     */
 333    if (devinfo->gen < 8 && inst->size_written > REG_SIZE && is_uniform(value))
 334       return false;
 335
 336    /* There is a regioning restriction such that if execsize == width
 337     * and hstride != 0 then the vstride can't be 0. When we split instrutions
 338     * that take a single-precision source (like F->DF conversions) we end up
 339     * with a 4-wide source on an instruction with an execution size of 4.
 340     * If we then copy-propagate the source from a uniform we also end up with a
 341     * vstride of 0 and we violate the restriction.
 342     */
 343    if (inst->exec_size == 4 && value.file == UNIFORM &&
 344        type_sz(value.type) == 4)
 345       return false;
 346
 347    /* If the type of the copy value is different from the type of the
 348     * instruction then the swizzles and writemasks involved don't have the same
 349     * meaning and simply replacing the source would produce different semantics.
 350     */
 351    if (type_sz(value.type) != type_sz(inst->src[arg].type))
 352       return false;
 353
 354    if (devinfo->gen >= 8 && (value.negate || value.abs) &&
 355        is_logic_op(inst->opcode)) {
 356       return false;
 357    }
 358
 359    if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE)
 360       return false;
 361
 362    bool has_source_modifiers = value.negate || value.abs;
 363
 364    /* gen6 math and gen7+ SENDs from GRFs ignore source modifiers on
 365     * instructions.
 366     */
 367    if ((has_source_modifiers || value.file == UNIFORM ||
 368         value.swizzle != BRW_SWIZZLE_XYZW) && !inst->can_do_source_mods(devinfo))
 369       return false;
 370
 371    if (has_source_modifiers &&
 372        value.type != inst->src[arg].type &&
 373        !inst->can_change_types())
 374       return false;
 375
 376    if (has_source_modifiers &&
 377        inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
 378       return false;
 379
 380    unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
 381                                                    value.swizzle);
 382
 383    /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles
 384     * so copy-propagation won't be safe if the composed swizzle is anything
 385     * other than the identity.
 386     */
 387    if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW)
 388       return false;
 389
 390    if (inst->is_3src(devinfo) &&
 391        (value.file == UNIFORM ||
 392         (value.file == ATTR && attributes_per_reg != 1)) &&
 393        !brw_is_single_value_swizzle(composed_swizzle))
 394       return false;
 395
 396    if (inst->is_send_from_grf())
 397       return false;
 398
 399    /* we can't generally copy-propagate UD negations becuse we
 400     * end up accessing the resulting values as signed integers
 401     * instead. See also resolve_ud_negate().
 402     */
 403    if (value.negate &&
 404        value.type == BRW_REGISTER_TYPE_UD)
 405       return false;
 406
 407    /* Don't report progress if this is a noop. */
 408    if (value.equals(inst->src[arg]))
 409       return false;
 410
 411    const unsigned dst_saturate_mask = inst->dst.writemask &
 412       brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);
 413
 414    if (dst_saturate_mask) {
 415       /* We either saturate all or nothing. */
 416       if (dst_saturate_mask != inst->dst.writemask)
 417          return false;
 418
 419       /* Limit saturate propagation only to SEL with src1 bounded within 0.0
 420        * and 1.0, otherwise skip copy propagate altogether.
 421        */
 422       switch(inst->opcode) {
 423       case BRW_OPCODE_SEL:
 424          if (arg != 0 ||
 425              inst->src[0].type != BRW_REGISTER_TYPE_F ||
 426              inst->src[1].file != IMM ||
 427              inst->src[1].type != BRW_REGISTER_TYPE_F ||
 428              inst->src[1].f < 0.0 ||
 429              inst->src[1].f > 1.0) {
 430             return false;
 431          }
 432          if (!inst->saturate)
 433             inst->saturate = true;
 434          break;
 435       default:
 436          return false;
 437       }
 438    }
 439
 440    /* Build the final value */
 441    if (inst->src[arg].abs) {
 442       value.negate = false;
 443       value.abs = true;
 444    }
 445    if (inst->src[arg].negate)
 446       value.negate = !value.negate;
 447
 448    value.swizzle = composed_swizzle;
 449    if (has_source_modifiers &&
 450        value.type != inst->src[arg].type) {
 451       assert(inst->can_change_types());
 452       for (int i = 0; i < 3; i++) {
 453          inst->src[i].type = value.type;
 454       }
 455       inst->dst.type = value.type;
 456    } else {
 457       value.type = inst->src[arg].type;
 458    }
 459
 460    inst->src[arg] = value;
 461    return true;
 462 }
 463
 464 bool
 465 vec4_visitor::opt_copy_propagation(bool do_constant_prop)
 466 {
 467    /* If we are in dual instanced or single mode, then attributes are going
 468     * to be interleaved, so one register contains two attribute slots.
 469     */
 470    const int attributes_per_reg =
 471       prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
 472    bool progress = false;
 473    struct copy_entry entries[alloc.total_size];
 474
 475    memset(&entries, 0, sizeof(entries));
 476
 477    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
 478       /* This pass only works on basic blocks.  If there's flow
 479        * control, throw out all our information and start from
 480        * scratch.
 481        *
 482        * This should really be fixed by using a structure like in
 483        * src/glsl/opt_copy_propagation.cpp to track available copies.
 484        */
 485       if (!is_dominated_by_previous_instruction(inst)) {
 486          memset(&entries, 0, sizeof(entries));
 487          continue;
 488       }
 489
 490       /* For each source arg, see if each component comes from a copy
 491        * from the same type file (IMM, VGRF, UNIFORM), and try
 492        * optimizing out access to the copy result
 493        */
 494       for (int i = 2; i >= 0; i--) {
 495          /* Copied values end up in GRFs, and we don't track reladdr
 496           * accesses.
 497           */
 498          if (inst->src[i].file != VGRF ||
 499              inst->src[i].reladdr)
 500             continue;
 501
 502          /* We only handle register-aligned single GRF copies. */
 503          if (inst->size_read(i) != REG_SIZE ||
 504              inst->src[i].offset % REG_SIZE)
 505             continue;
 506
 507          const unsigned reg = (alloc.offsets[inst->src[i].nr] +
 508                                inst->src[i].offset / REG_SIZE);
 509          const copy_entry &entry = entries[reg];
 510
 511          if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry))
 512             progress = true;
 513          else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg))
 514             progress = true;
 515       }
 516
 517       /* Track available source registers. */
 518       if (inst->dst.file == VGRF) {
 519          const int reg =
 520             alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE;
 521
 522          /* Update our destination's current channel values.  For a direct copy,
 523           * the value is the newly propagated source.  Otherwise, we don't know
 524           * the new value, so clear it.
 525           */
 526          bool direct_copy = is_direct_copy(inst);
 527          entries[reg].saturatemask &= ~inst->dst.writemask;
 528          for (int i = 0; i < 4; i++) {
 529             if (inst->dst.writemask & (1 << i)) {
 530                entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
 531                entries[reg].saturatemask |=
 532                   inst->saturate && direct_copy ? 1 << i : 0;
 533             }
 534          }
 535
 536          /* Clear the records for any registers whose current value came from
 537           * our destination's updated channels, as the two are no longer equal.
 538           */
 539          if (inst->dst.reladdr)
 540             memset(&entries, 0, sizeof(entries));
 541          else {
 542             for (unsigned i = 0; i < alloc.total_size; i++) {
 543                for (int j = 0; j < 4; j++) {
 544                   if (is_channel_updated(inst, entries[i].value, j)) {
 545                      entries[i].value[j] = NULL;
 546                      entries[i].saturatemask &= ~(1 << j);
 547                   }
 548                }
 549             }
 550          }
 551       }
 552    }
 553
 554    if (progress)
 555       invalidate_live_intervals();
 556
 557    return progress;
 558 }
 559
 560 } /* namespace brw */