intel/compiler: Fix pointer arithmetic when reading shader assembly
[mesa.git] / src / intel / compiler / brw_vec4_copy_propagation.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * @file brw_vec4_copy_propagation.cpp
26 *
27 * Implements tracking of values copied between registers, and
28 * optimizations based on that: copy propagation and constant
29 * propagation.
30 */
31
32 #include "brw_vec4.h"
33 #include "brw_cfg.h"
34 #include "brw_eu.h"
35
36 namespace brw {
37
/**
 * Tracked contents of one GRF: per-channel, the source register of the last
 * direct copy that wrote it (NULL when the channel's value is unknown), plus
 * a mask of the channels that were written by a saturating MOV.
 */
struct copy_entry {
   src_reg *value[4];   /* per-channel origin of the copy, or NULL */
   int saturatemask;    /* channels whose copy had inst->saturate set */
};
42
43 static bool
44 is_direct_copy(vec4_instruction *inst)
45 {
46 return (inst->opcode == BRW_OPCODE_MOV &&
47 !inst->predicate &&
48 inst->dst.file == VGRF &&
49 inst->dst.offset % REG_SIZE == 0 &&
50 !inst->dst.reladdr &&
51 !inst->src[0].reladdr &&
52 (inst->dst.type == inst->src[0].type ||
53 (inst->dst.type == BRW_REGISTER_TYPE_F &&
54 inst->src[0].type == BRW_REGISTER_TYPE_VF)));
55 }
56
57 static bool
58 is_dominated_by_previous_instruction(vec4_instruction *inst)
59 {
60 return (inst->opcode != BRW_OPCODE_DO &&
61 inst->opcode != BRW_OPCODE_WHILE &&
62 inst->opcode != BRW_OPCODE_ELSE &&
63 inst->opcode != BRW_OPCODE_ENDIF);
64 }
65
/**
 * Return whether the write performed by \p inst invalidates the copy that
 * channel \p ch of \p values was sourced from.
 */
static bool
is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
{
   const src_reg *src = values[ch];

   /* consider GRF only */
   assert(inst->dst.file == VGRF);
   if (!src || src->file != VGRF)
      return false;

   /* The copy is clobbered when the written region overlaps the copy's
    * source and either the offsets differ (we conservatively can't compare
    * channels across different offsets) or the writemask hits the exact
    * channel this copy reads through its swizzle.
    */
   return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) &&
          (inst->dst.offset != src->offset ||
           inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch)));
}
80
81 static bool
82 is_logic_op(enum opcode opcode)
83 {
84 return (opcode == BRW_OPCODE_AND ||
85 opcode == BRW_OPCODE_OR ||
86 opcode == BRW_OPCODE_XOR ||
87 opcode == BRW_OPCODE_NOT);
88 }
89
/**
 * Get the origin of a copy as a single register if all components present in
 * the given readmask originate from the same register and have compatible
 * regions, otherwise return a BAD_FILE register.
 */
static src_reg
get_copy_value(const copy_entry &entry, unsigned readmask)
{
   unsigned swz[4] = {};
   src_reg value;

   for (unsigned i = 0; i < 4; i++) {
      if (readmask & (1 << i)) {
         /* Every channel selected by the readmask must have a tracked copy,
          * and all of them must agree on the same source register.
          */
         if (entry.value[i]) {
            src_reg src = *entry.value[i];

            if (src.file == IMM) {
               swz[i] = i;
            } else {
               swz[i] = BRW_GET_SWZ(src.swizzle, i);
               /* Overwrite the original swizzle so the src_reg::equals call
                * below doesn't care about it, the correct swizzle will be
                * calculated once the swizzles of all components are known.
                */
               src.swizzle = BRW_SWIZZLE_XYZW;
            }

            if (value.file == BAD_FILE) {
               value = src;
            } else if (!value.equals(src)) {
               return src_reg();
            }
         } else {
            return src_reg();
         }
      }
   }

   /* Reapply the per-channel swizzles collected above, composed with the
    * readmask's own swizzle, on top of the common source register.
    */
   return swizzle(value,
                  brw_compose_swizzle(brw_swizzle_for_mask(readmask),
                                      BRW_SWIZZLE4(swz[0], swz[1],
                                                   swz[2], swz[3])));
}
133
/**
 * Try to replace source \p arg of \p inst with the immediate recorded in
 * \p entry.  Only some opcodes accept an immediate operand, and generally
 * only in src1 — the switch below encodes, per opcode, where an immediate is
 * legal and when operands may be commuted to make room for one.
 *
 * Returns true and rewrites the source on success.
 */
static bool
try_constant_propagate(const struct gen_device_info *devinfo,
                       vec4_instruction *inst,
                       int arg, const copy_entry *entry)
{
   /* For constant propagation, we only handle the same constant
    * across all 4 channels. Some day, we should handle the 8-bit
    * float vector format, which would let us constant propagate
    * vectors better.
    * We could be more aggressive here -- some channels might not get used
    * based on the destination writemask.
    */
   src_reg value =
      get_copy_value(*entry,
                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
                                                   WRITEMASK_XYZW));

   if (value.file != IMM)
      return false;

   /* 64-bit types can't be used except for one-source instructions, which
    * higher levels should have constant folded away, so there's no point in
    * propagating immediates here.
    */
   if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8)
      return false;

   if (value.type == BRW_REGISTER_TYPE_VF) {
      /* The result of bit-casting the component values of a vector float
       * cannot in general be represented as an immediate.
       */
      if (inst->src[arg].type != BRW_REGISTER_TYPE_F)
         return false;
   } else {
      value.type = inst->src[arg].type;
   }

   /* Fold the source's abs/negate modifiers into the immediate itself; bail
    * if the immediate type can't express the result or the opcode is a
    * Gen8+ logic op (see is_logic_op()).
    */
   if (inst->src[arg].abs) {
      if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
          !brw_abs_immediate(value.type, &value.as_brw_reg())) {
         return false;
      }
   }

   if (inst->src[arg].negate) {
      if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
          !brw_negate_immediate(value.type, &value.as_brw_reg())) {
         return false;
      }
   }

   value = swizzle(value, inst->src[arg].swizzle);

   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
   case SHADER_OPCODE_BROADCAST:
      inst->src[arg] = value;
      return true;

   case VEC4_OPCODE_UNTYPED_ATOMIC:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      }
      break;

   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Extended math only takes an immediate operand on Gen8+. */
      if (devinfo->gen < 8)
         break;
      /* fallthrough */
   case BRW_OPCODE_DP2:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DPH:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SUBB:
      /* Non-commutative two-source ops: immediate allowed in src1 only. */
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      }
      break;

   case BRW_OPCODE_MACH:
   case BRW_OPCODE_MUL:
   case SHADER_OPCODE_MULH:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_ADDC:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         /* Fit this constant in by commuting the operands. Exception: we
          * can't do this for 32-bit integer MUL/MACH because it's asymmetric.
          */
         if ((inst->opcode == BRW_OPCODE_MUL ||
              inst->opcode == BRW_OPCODE_MACH) &&
             (inst->src[1].type == BRW_REGISTER_TYPE_D ||
              inst->src[1].type == BRW_REGISTER_TYPE_UD))
            break;
         inst->src[0] = inst->src[1];
         inst->src[1] = value;
         return true;
      }
      break;
   case GS_OPCODE_SET_WRITE_OFFSET:
      /* This is just a multiply by a constant with special strides.
       * The generator will handle immediates in both arguments (generating
       * a single MOV of the product). So feel free to propagate in src0.
       */
      inst->src[arg] = value;
      return true;

   case BRW_OPCODE_CMP:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         enum brw_conditional_mod new_cmod;

         new_cmod = brw_swap_cmod(inst->conditional_mod);
         if (new_cmod != BRW_CONDITIONAL_NONE) {
            /* Fit this constant in by swapping the operands and
             * flipping the test.
             */
            inst->src[0] = inst->src[1];
            inst->src[1] = value;
            inst->conditional_mod = new_cmod;
            return true;
         }
      }
      break;

   case BRW_OPCODE_SEL:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         inst->src[0] = inst->src[1];
         inst->src[1] = value;

         /* If this was predicated, flipping operands means
          * we also need to flip the predicate.
          */
         if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
            inst->predicate_inverse = !inst->predicate_inverse;
         }
         return true;
      }
      break;

   default:
      break;
   }

   return false;
}
298
299 static bool
300 is_align1_opcode(unsigned opcode)
301 {
302 switch (opcode) {
303 case VEC4_OPCODE_DOUBLE_TO_F32:
304 case VEC4_OPCODE_DOUBLE_TO_D32:
305 case VEC4_OPCODE_DOUBLE_TO_U32:
306 case VEC4_OPCODE_TO_DOUBLE:
307 case VEC4_OPCODE_PICK_LOW_32BIT:
308 case VEC4_OPCODE_PICK_HIGH_32BIT:
309 case VEC4_OPCODE_SET_LOW_32BIT:
310 case VEC4_OPCODE_SET_HIGH_32BIT:
311 return true;
312 default:
313 return false;
314 }
315 }
316
/**
 * Try to replace source \p arg of \p inst with the register value recorded
 * in \p entry, subject to the hardware regioning, type, swizzle, and
 * source-modifier restrictions checked below.
 *
 * Returns true and rewrites the source (possibly retyping the whole
 * instruction) on success.
 */
static bool
try_copy_propagate(const struct gen_device_info *devinfo,
                   vec4_instruction *inst, int arg,
                   const copy_entry *entry, int attributes_per_reg)
{
   /* Build up the value we are propagating as if it were the source of a
    * single MOV
    */
   src_reg value =
      get_copy_value(*entry,
                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
                                                   WRITEMASK_XYZW));

   /* Check that we can propagate that value */
   if (value.file != UNIFORM &&
       value.file != VGRF &&
       value.file != ATTR)
      return false;

   /* In gen < 8 instructions that write 2 registers also need to read 2
    * registers. Make sure we don't break that restriction by copy
    * propagating from a uniform.
    */
   if (devinfo->gen < 8 && inst->size_written > REG_SIZE && is_uniform(value))
      return false;

   /* There is a regioning restriction such that if execsize == width
    * and hstride != 0 then the vstride can't be 0. When we split instructions
    * that take a single-precision source (like F->DF conversions) we end up
    * with a 4-wide source on an instruction with an execution size of 4.
    * If we then copy-propagate the source from a uniform we also end up with a
    * vstride of 0 and we violate the restriction.
    */
   if (inst->exec_size == 4 && value.file == UNIFORM &&
       type_sz(value.type) == 4)
      return false;

   /* If the type of the copy value is different from the type of the
    * instruction then the swizzles and writemasks involved don't have the same
    * meaning and simply replacing the source would produce different semantics.
    */
   if (type_sz(value.type) != type_sz(inst->src[arg].type))
      return false;

   /* Gen8+ logic ops don't take source modifiers (see is_logic_op()). */
   if (devinfo->gen >= 8 && (value.negate || value.abs) &&
       is_logic_op(inst->opcode)) {
      return false;
   }

   /* Only register-aligned regions are handled. */
   if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE)
      return false;

   bool has_source_modifiers = value.negate || value.abs;

   /* gen6 math and gen7+ SENDs from GRFs ignore source modifiers on
    * instructions.
    */
   if ((has_source_modifiers || value.file == UNIFORM ||
        value.swizzle != BRW_SWIZZLE_XYZW) && !inst->can_do_source_mods(devinfo))
      return false;

   if (has_source_modifiers &&
       value.type != inst->src[arg].type &&
       !inst->can_change_types())
      return false;

   if (has_source_modifiers &&
       (inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE ||
        inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT))
      return false;

   unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
                                                   value.swizzle);

   /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles
    * so copy-propagation won't be safe if the composed swizzle is anything
    * other than the identity.
    */
   if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW)
      return false;

   if (inst->is_3src(devinfo) &&
       (value.file == UNIFORM ||
        (value.file == ATTR && attributes_per_reg != 1)) &&
       !brw_is_single_value_swizzle(composed_swizzle))
      return false;

   if (inst->is_send_from_grf())
      return false;

   /* we can't generally copy-propagate UD negations because we
    * end up accessing the resulting values as signed integers
    * instead. See also resolve_ud_negate().
    */
   if (value.negate &&
       value.type == BRW_REGISTER_TYPE_UD)
      return false;

   /* Don't report progress if this is a noop. */
   if (value.equals(inst->src[arg]))
      return false;

   /* Channels of the copy that came from a saturating MOV, mapped through
    * this instruction's source swizzle onto the destination channels.
    */
   const unsigned dst_saturate_mask = inst->dst.writemask &
      brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);

   if (dst_saturate_mask) {
      /* We either saturate all or nothing. */
      if (dst_saturate_mask != inst->dst.writemask)
         return false;

      /* Limit saturate propagation only to SEL with src1 bounded within 0.0
       * and 1.0, otherwise skip copy propagate altogether.
       */
      switch(inst->opcode) {
      case BRW_OPCODE_SEL:
         if (arg != 0 ||
             inst->src[0].type != BRW_REGISTER_TYPE_F ||
             inst->src[1].file != IMM ||
             inst->src[1].type != BRW_REGISTER_TYPE_F ||
             inst->src[1].f < 0.0 ||
             inst->src[1].f > 1.0) {
            return false;
         }
         if (!inst->saturate)
            inst->saturate = true;
         break;
      default:
         return false;
      }
   }

   /* Build the final value */
   if (inst->src[arg].abs) {
      value.negate = false;
      value.abs = true;
   }
   if (inst->src[arg].negate)
      value.negate = !value.negate;

   value.swizzle = composed_swizzle;
   if (has_source_modifiers &&
       value.type != inst->src[arg].type) {
      /* Retype the whole instruction so the modifiers keep their meaning;
       * can_change_types() was verified above.
       */
      assert(inst->can_change_types());
      for (int i = 0; i < 3; i++) {
         inst->src[i].type = value.type;
      }
      inst->dst.type = value.type;
   } else {
      value.type = inst->src[arg].type;
   }

   inst->src[arg] = value;
   return true;
}
471
/**
 * Walk the program tracking per-GRF copies and rewrite instruction sources
 * to read the copied value directly (copy propagation) or, when
 * \p do_constant_prop is set, the propagated immediate (constant
 * propagation).
 *
 * Returns true if any instruction was changed.
 */
bool
vec4_visitor::opt_copy_propagation(bool do_constant_prop)
{
   /* If we are in dual instanced or single mode, then attributes are going
    * to be interleaved, so one register contains two attribute slots.
    */
   const int attributes_per_reg =
      prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
   bool progress = false;
   /* One entry per allocated GRF.  NOTE(review): this is a variable-length
    * array, which is a compiler extension in C++ rather than standard —
    * presumably acceptable for the toolchains this project supports.
    */
   struct copy_entry entries[alloc.total_size];

   memset(&entries, 0, sizeof(entries));

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* This pass only works on basic blocks. If there's flow
       * control, throw out all our information and start from
       * scratch.
       *
       * This should really be fixed by using a structure like in
       * src/glsl/opt_copy_propagation.cpp to track available copies.
       */
      if (!is_dominated_by_previous_instruction(inst)) {
         memset(&entries, 0, sizeof(entries));
         continue;
      }

      /* For each source arg, see if each component comes from a copy
       * from the same type file (IMM, VGRF, UNIFORM), and try
       * optimizing out access to the copy result
       */
      for (int i = 2; i >= 0; i--) {
         /* Copied values end up in GRFs, and we don't track reladdr
          * accesses.
          */
         if (inst->src[i].file != VGRF ||
             inst->src[i].reladdr)
            continue;

         /* We only handle register-aligned single GRF copies. */
         if (inst->size_read(i) != REG_SIZE ||
             inst->src[i].offset % REG_SIZE)
            continue;

         /* Index of the GRF this source reads within the flat allocation. */
         const unsigned reg = (alloc.offsets[inst->src[i].nr] +
                               inst->src[i].offset / REG_SIZE);
         const copy_entry &entry = entries[reg];

         if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry))
            progress = true;
         else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg))
            progress = true;
      }

      /* Track available source registers. */
      if (inst->dst.file == VGRF) {
         const int reg =
            alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE;

         /* Update our destination's current channel values. For a direct copy,
          * the value is the newly propagated source. Otherwise, we don't know
          * the new value, so clear it.
          */
         bool direct_copy = is_direct_copy(inst);
         entries[reg].saturatemask &= ~inst->dst.writemask;
         for (int i = 0; i < 4; i++) {
            if (inst->dst.writemask & (1 << i)) {
               entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
               entries[reg].saturatemask |=
                  inst->saturate && direct_copy ? 1 << i : 0;
            }
         }

         /* Clear the records for any registers whose current value came from
          * our destination's updated channels, as the two are no longer equal.
          */
         if (inst->dst.reladdr)
            memset(&entries, 0, sizeof(entries));
         else {
            for (unsigned i = 0; i < alloc.total_size; i++) {
               for (int j = 0; j < 4; j++) {
                  if (is_channel_updated(inst, entries[i].value, j)) {
                     entries[i].value[j] = NULL;
                     entries[i].saturatemask &= ~(1 << j);
                  }
               }
            }
         }
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                          DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
568
569 } /* namespace brw */