/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file brw_vec4_copy_propagation.cpp
 *
 * Implements tracking of values copied between registers, and
 * optimizations based on that: copy propagation and constant
 * propagation.
 */
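
/* As a rough illustration (the register numbers and values below are
 * hypothetical, not actual compiler output), given
 *
 *    mov vgrf1, vgrf0                  mov vgrf2, 0.5F
 *    mul vgrf3, vgrf1.xxxx, vgrf4      add vgrf5, vgrf6, vgrf2.xxxx
 *
 * copy propagation rewrites the MUL to read vgrf0.xxxx directly, and
 * constant propagation rewrites the ADD to use the 0.5F immediate, so that
 * later passes can eliminate the MOVs as dead code.
 */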

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"

namespace brw {

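/* Tracked state for a single GRF: value[i] points at the source of the
 * direct copy that last wrote channel i (NULL if that channel's origin is
 * unknown), and saturatemask records which of those channels were written
 * by a saturating copy.
 */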
struct copy_entry {
   src_reg *value[4];
   int saturatemask;
};

static bool
is_direct_copy(vec4_instruction *inst)
{
   return (inst->opcode == BRW_OPCODE_MOV &&
           !inst->predicate &&
           inst->dst.file == VGRF &&
           inst->dst.offset % REG_SIZE == 0 &&
           !inst->dst.reladdr &&
           !inst->src[0].reladdr &&
           (inst->dst.type == inst->src[0].type ||
            (inst->dst.type == BRW_REGISTER_TYPE_F &&
             inst->src[0].type == BRW_REGISTER_TYPE_VF)));
}

static bool
is_dominated_by_previous_instruction(vec4_instruction *inst)
{
   return (inst->opcode != BRW_OPCODE_DO &&
           inst->opcode != BRW_OPCODE_WHILE &&
           inst->opcode != BRW_OPCODE_ELSE &&
           inst->opcode != BRW_OPCODE_ENDIF);
}

static bool
is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
{
   const src_reg *src = values[ch];

   /* consider GRF only */
   assert(inst->dst.file == VGRF);
   if (!src || src->file != VGRF)
      return false;

   return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) &&
          (inst->dst.offset != src->offset ||
           inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch)));
}

static bool
is_logic_op(enum opcode opcode)
{
   return (opcode == BRW_OPCODE_AND ||
           opcode == BRW_OPCODE_OR ||
           opcode == BRW_OPCODE_XOR ||
           opcode == BRW_OPCODE_NOT);
}

/**
 * Get the origin of a copy as a single register if all components present in
 * the given readmask originate from the same register and have compatible
 * regions, otherwise return a BAD_FILE register.
 */
static src_reg
get_copy_value(const copy_entry &entry, unsigned readmask)
{
   unsigned swz[4] = {};
   src_reg value;

   for (unsigned i = 0; i < 4; i++) {
      if (readmask & (1 << i)) {
         if (entry.value[i]) {
            src_reg src = *entry.value[i];

            if (src.file == IMM) {
               swz[i] = i;
            } else {
               swz[i] = BRW_GET_SWZ(src.swizzle, i);
               /* Overwrite the original swizzle so the src_reg::equals call
                * below doesn't care about it; the correct swizzle will be
                * calculated once the swizzles of all components are known.
                */
               src.swizzle = BRW_SWIZZLE_XYZW;
            }

            if (value.file == BAD_FILE) {
               value = src;
            } else if (!value.equals(src)) {
               return src_reg();
            }
         } else {
            return src_reg();
         }
      }
   }

   return swizzle(value,
                  brw_compose_swizzle(brw_swizzle_for_mask(readmask),
                                      BRW_SWIZZLE4(swz[0], swz[1],
                                                   swz[2], swz[3])));
}

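/**
 * Try to rewrite source @p arg of @p inst to read the immediate recorded in
 * @p entry directly, commuting operands or flipping the conditional mod
 * where the instruction allows it. Returns true if the source was replaced.
 */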
static bool
try_constant_propagate(const struct gen_device_info *devinfo,
                       vec4_instruction *inst,
                       int arg, const copy_entry *entry)
{
   /* For constant propagation, we only handle the same constant
    * across all 4 channels. Some day, we should handle the 8-bit
    * float vector format, which would let us constant propagate
    * vectors better.
    * We could be more aggressive here -- some channels might not get used
    * based on the destination writemask.
    */
   src_reg value =
      get_copy_value(*entry,
                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
                                                   WRITEMASK_XYZW));

   if (value.file != IMM)
      return false;

   /* 64-bit types can't be used except for one-source instructions, which
    * higher levels should have constant folded away, so there's no point in
    * propagating immediates here.
    */
   if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8)
      return false;

   if (value.type == BRW_REGISTER_TYPE_VF) {
      /* The result of bit-casting the component values of a vector float
       * cannot in general be represented as an immediate.
       */
      if (inst->src[arg].type != BRW_REGISTER_TYPE_F)
         return false;
   } else {
      value.type = inst->src[arg].type;
   }

   if (inst->src[arg].abs) {
      if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
          !brw_abs_immediate(value.type, &value.as_brw_reg())) {
         return false;
      }
   }

   if (inst->src[arg].negate) {
      if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
          !brw_negate_immediate(value.type, &value.as_brw_reg())) {
         return false;
      }
   }

   value = swizzle(value, inst->src[arg].swizzle);

   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
   case SHADER_OPCODE_BROADCAST:
      inst->src[arg] = value;
      return true;

   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (devinfo->gen < 8)
         break;
      /* fallthrough */
   case BRW_OPCODE_DP2:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DPH:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SUBB:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      }
      break;

   case BRW_OPCODE_MACH:
   case BRW_OPCODE_MUL:
   case SHADER_OPCODE_MULH:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_ADDC:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         /* Fit this constant in by commuting the operands. Exception: we
          * can't do this for 32-bit integer MUL/MACH because it's asymmetric.
          */
         if ((inst->opcode == BRW_OPCODE_MUL ||
              inst->opcode == BRW_OPCODE_MACH) &&
             (inst->src[1].type == BRW_REGISTER_TYPE_D ||
              inst->src[1].type == BRW_REGISTER_TYPE_UD))
            break;
         inst->src[0] = inst->src[1];
         inst->src[1] = value;
         return true;
      }
      break;
   case GS_OPCODE_SET_WRITE_OFFSET:
      /* This is just a multiply by a constant with special strides.
       * The generator will handle immediates in both arguments (generating
       * a single MOV of the product). So feel free to propagate in src0.
       */
      inst->src[arg] = value;
      return true;

   case BRW_OPCODE_CMP:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         enum brw_conditional_mod new_cmod;

         new_cmod = brw_swap_cmod(inst->conditional_mod);
         if (new_cmod != BRW_CONDITIONAL_NONE) {
            /* Fit this constant in by swapping the operands and
             * flipping the test.
             */
            inst->src[0] = inst->src[1];
            inst->src[1] = value;
            inst->conditional_mod = new_cmod;
            return true;
         }
      }
      break;

   case BRW_OPCODE_SEL:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         inst->src[0] = inst->src[1];
         inst->src[1] = value;

         /* If this was predicated, flipping operands means
          * we also need to flip the predicate.
          */
         if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
            inst->predicate_inverse = !inst->predicate_inverse;
         }
         return true;
      }
      break;

   default:
      break;
   }

   return false;
}

static bool
is_align1_opcode(unsigned opcode)
{
   switch (opcode) {
   case VEC4_OPCODE_FROM_DOUBLE:
   case VEC4_OPCODE_TO_DOUBLE:
   case VEC4_OPCODE_PICK_LOW_32BIT:
   case VEC4_OPCODE_PICK_HIGH_32BIT:
   case VEC4_OPCODE_SET_LOW_32BIT:
   case VEC4_OPCODE_SET_HIGH_32BIT:
      return true;
   default:
      return false;
   }
}

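/**
 * Try to rewrite source @p arg of @p inst to read straight from the register
 * recorded in @p entry (a VGRF, UNIFORM or ATTR), folding the copy's swizzle,
 * type and source modifiers into the instruction. Returns true if the source
 * was replaced.
 */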
static bool
try_copy_propagate(const struct gen_device_info *devinfo,
                   vec4_instruction *inst, int arg,
                   const copy_entry *entry, int attributes_per_reg)
{
   /* Build up the value we are propagating as if it were the source of a
    * single MOV.
    */
   src_reg value =
      get_copy_value(*entry,
                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
                                                   WRITEMASK_XYZW));

   /* Check that we can propagate that value */
   if (value.file != UNIFORM &&
       value.file != VGRF &&
       value.file != ATTR)
      return false;

   /* In gen < 8 instructions that write 2 registers also need to read 2
    * registers. Make sure we don't break that restriction by copy
    * propagating from a uniform.
    */
   if (devinfo->gen < 8 && inst->size_written > REG_SIZE && is_uniform(value))
      return false;

   /* There is a regioning restriction such that if execsize == width
    * and hstride != 0 then the vstride can't be 0. When we split instructions
    * that take a single-precision source (like F->DF conversions) we end up
    * with a 4-wide source on an instruction with an execution size of 4.
    * If we then copy-propagate the source from a uniform we also end up with a
    * vstride of 0 and we violate the restriction.
    */
   if (inst->exec_size == 4 && value.file == UNIFORM &&
       type_sz(value.type) == 4)
      return false;

   /* If the type of the copy value is different from the type of the
    * instruction then the swizzles and writemasks involved don't have the same
    * meaning and simply replacing the source would produce different semantics.
    */
   if (type_sz(value.type) != type_sz(inst->src[arg].type))
      return false;

   if (devinfo->gen >= 8 && (value.negate || value.abs) &&
       is_logic_op(inst->opcode)) {
      return false;
   }

   if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE)
      return false;

   bool has_source_modifiers = value.negate || value.abs;

   /* gen6 math and gen7+ SENDs from GRFs ignore source modifiers on
    * instructions.
    */
   if ((has_source_modifiers || value.file == UNIFORM ||
        value.swizzle != BRW_SWIZZLE_XYZW) && !inst->can_do_source_mods(devinfo))
      return false;

   if (has_source_modifiers &&
       value.type != inst->src[arg].type &&
       !inst->can_change_types())
      return false;

   if (has_source_modifiers &&
       inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
      return false;

   unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
                                                   value.swizzle);

   /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles
    * so copy-propagation won't be safe if the composed swizzle is anything
    * other than the identity.
    */
   if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW)
      return false;

   if (inst->is_3src(devinfo) &&
       (value.file == UNIFORM ||
        (value.file == ATTR && attributes_per_reg != 1)) &&
       !brw_is_single_value_swizzle(composed_swizzle))
      return false;

   if (inst->is_send_from_grf())
      return false;

   /* We can't generally copy-propagate UD negations because we
    * end up accessing the resulting values as signed integers
    * instead. See also resolve_ud_negate().
    */
   if (value.negate &&
       value.type == BRW_REGISTER_TYPE_UD)
      return false;

   /* Don't report progress if this is a noop. */
   if (value.equals(inst->src[arg]))
      return false;

   const unsigned dst_saturate_mask = inst->dst.writemask &
      brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);

   if (dst_saturate_mask) {
      /* We either saturate all or nothing. */
      if (dst_saturate_mask != inst->dst.writemask)
         return false;

      /* Limit saturate propagation only to SEL with src1 bounded within 0.0
       * and 1.0, otherwise skip copy propagation altogether.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_SEL:
         if (arg != 0 ||
             inst->src[0].type != BRW_REGISTER_TYPE_F ||
             inst->src[1].file != IMM ||
             inst->src[1].type != BRW_REGISTER_TYPE_F ||
             inst->src[1].f < 0.0 ||
             inst->src[1].f > 1.0) {
            return false;
         }
         if (!inst->saturate)
            inst->saturate = true;
         break;
      default:
         return false;
      }
   }

   /* Build the final value */
   if (inst->src[arg].abs) {
      value.negate = false;
      value.abs = true;
   }
   if (inst->src[arg].negate)
      value.negate = !value.negate;

   value.swizzle = composed_swizzle;
   if (has_source_modifiers &&
       value.type != inst->src[arg].type) {
      assert(inst->can_change_types());
      for (int i = 0; i < 3; i++) {
         inst->src[i].type = value.type;
      }
      inst->dst.type = value.type;
   } else {
      value.type = inst->src[arg].type;
   }

   inst->src[arg] = value;
   return true;
}

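/**
 * Walk each basic block tracking per-GRF copy entries and rewrite
 * instruction sources via try_constant_propagate() and try_copy_propagate()
 * where the recorded copies allow it. Live intervals are invalidated if
 * anything changed.
 */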
bool
vec4_visitor::opt_copy_propagation(bool do_constant_prop)
{
   /* If we are in dual instanced or single mode, then attributes are going
    * to be interleaved, so one register contains two attribute slots.
    */
   const int attributes_per_reg =
      prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
   bool progress = false;
   struct copy_entry entries[alloc.total_size];

   memset(&entries, 0, sizeof(entries));

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* This pass only works on basic blocks. If there's flow
       * control, throw out all our information and start from
       * scratch.
       *
       * This should really be fixed by using a structure like in
       * src/glsl/opt_copy_propagation.cpp to track available copies.
       */
      if (!is_dominated_by_previous_instruction(inst)) {
         memset(&entries, 0, sizeof(entries));
         continue;
      }

      /* For each source arg, see if each component comes from a copy
       * from the same type file (IMM, VGRF, UNIFORM), and try
       * optimizing out access to the copy result.
       */
      for (int i = 2; i >= 0; i--) {
         /* Copied values end up in GRFs, and we don't track reladdr
          * accesses.
          */
         if (inst->src[i].file != VGRF ||
             inst->src[i].reladdr)
            continue;

         /* We only handle register-aligned single GRF copies. */
         if (inst->size_read(i) != REG_SIZE ||
             inst->src[i].offset % REG_SIZE)
            continue;

         const unsigned reg = (alloc.offsets[inst->src[i].nr] +
                               inst->src[i].offset / REG_SIZE);
         const copy_entry &entry = entries[reg];

         if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry))
            progress = true;
         else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg))
            progress = true;
      }

      /* Track available source registers. */
      if (inst->dst.file == VGRF) {
         const int reg =
            alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE;

         /* Update our destination's current channel values. For a direct copy,
          * the value is the newly propagated source. Otherwise, we don't know
          * the new value, so clear it.
          */
         bool direct_copy = is_direct_copy(inst);
         entries[reg].saturatemask &= ~inst->dst.writemask;
         for (int i = 0; i < 4; i++) {
            if (inst->dst.writemask & (1 << i)) {
               entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
               entries[reg].saturatemask |=
                  inst->saturate && direct_copy ? 1 << i : 0;
            }
         }

         /* Clear the records for any registers whose current value came from
          * our destination's updated channels, as the two are no longer equal.
          */
         if (inst->dst.reladdr)
            memset(&entries, 0, sizeof(entries));
         else {
            for (unsigned i = 0; i < alloc.total_size; i++) {
               for (int j = 0; j < 4; j++) {
                  if (is_channel_updated(inst, entries[i].value, j)) {
                     entries[i].value[j] = NULL;
                     entries[i].saturatemask &= ~(1 << j);
                  }
               }
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

} /* namespace brw */