src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_copy_propagation.cpp
  25  *
  26  * Support for global copy propagation in two passes: A local pass that does
  27  * intra-block copy (and constant) propagation, and a global pass that uses
  28  * dataflow analysis on the copies available at the end of each block to re-do
  29  * local copy propagation with more copies available.
  30  *
  31  * See Muchnick's Advanced Compiler Design and Implementation, section
  32  * 12.5 (p356).
  33  */
  34
/**
 * Number of buckets in each available-copy-propagation (ACP) table.  Entries
 * are placed in the bucket selected by their destination register number
 * modulo this value.
 */
#define ACP_HASH_SIZE 16
  36
  37 #include "util/bitset.h"
  38 #include "brw_fs.h"
  39 #include "brw_cfg.h"
  40 #include "brw_eu.h"
  41
  42 namespace { /* avoid conflict with opt_copy_propagation_elements */
  43 struct acp_entry : public exec_node {
  44    fs_reg dst;
  45    fs_reg src;
  46    uint8_t regs_written;
  47    enum opcode opcode;
  48    bool saturate;
  49 };
  50
  51 struct block_data {
  52    /**
  53     * Which entries in the fs_copy_prop_dataflow acp table are live at the
  54     * start of this block.  This is the useful output of the analysis, since
  55     * it lets us plug those into the local copy propagation on the second
  56     * pass.
  57     */
  58    BITSET_WORD *livein;
  59
  60    /**
  61     * Which entries in the fs_copy_prop_dataflow acp table are live at the end
  62     * of this block.  This is done in initial setup from the per-block acps
  63     * returned by the first local copy prop pass.
  64     */
  65    BITSET_WORD *liveout;
  66
  67    /**
  68     * Which entries in the fs_copy_prop_dataflow acp table are generated by
  69     * instructions in this block which reach the end of the block without
  70     * being killed.
  71     */
  72    BITSET_WORD *copy;
  73
  74    /**
  75     * Which entries in the fs_copy_prop_dataflow acp table are killed over the
  76     * course of this block.
  77     */
  78    BITSET_WORD *kill;
  79 };
  80
  81 class fs_copy_prop_dataflow
  82 {
  83 public:
  84    fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
  85                          exec_list *out_acp[ACP_HASH_SIZE]);
  86
  87    void setup_initial_values();
  88    void run();
  89
  90    void dump_block_data() const UNUSED;
  91
  92    void *mem_ctx;
  93    cfg_t *cfg;
  94
  95    acp_entry **acp;
  96    int num_acp;
  97    int bitset_words;
  98
  99   struct block_data *bd;
 100 };
 101 } /* anonymous namespace */
 102
 103 fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
 104                                              exec_list *out_acp[ACP_HASH_SIZE])
 105    : mem_ctx(mem_ctx), cfg(cfg)
 106 {
 107    bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
 108
 109    num_acp = 0;
 110    foreach_block (block, cfg) {
 111       for (int i = 0; i < ACP_HASH_SIZE; i++) {
 112          num_acp += out_acp[block->num][i].length();
 113       }
 114    }
 115
 116    acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp);
 117
 118    bitset_words = BITSET_WORDS(num_acp);
 119
 120    int next_acp = 0;
 121    foreach_block (block, cfg) {
 122       bd[block->num].livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
 123       bd[block->num].liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
 124       bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
 125       bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words);
 126
 127       for (int i = 0; i < ACP_HASH_SIZE; i++) {
 128          foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
 129             acp[next_acp] = entry;
 130
 131             /* opt_copy_propagate_local populates out_acp with copies created
 132              * in a block which are still live at the end of the block.  This
 133              * is exactly what we want in the COPY set.
 134              */
 135             BITSET_SET(bd[block->num].copy, next_acp);
 136
 137             next_acp++;
 138          }
 139       }
 140    }
 141
 142    assert(next_acp == num_acp);
 143
 144    setup_initial_values();
 145    run();
 146 }
 147
 148 /**
 149  * Set up initial values for each of the data flow sets, prior to running
 150  * the fixed-point algorithm.
 151  */
 152 void
 153 fs_copy_prop_dataflow::setup_initial_values()
 154 {
 155    /* Initialize the COPY and KILL sets. */
 156    foreach_block (block, cfg) {
 157       foreach_inst_in_block(fs_inst, inst, block) {
 158          if (inst->dst.file != VGRF)
 159             continue;
 160
 161          /* Mark ACP entries which are killed by this instruction. */
 162          for (int i = 0; i < num_acp; i++) {
 163             if (inst->overwrites_reg(acp[i]->dst) ||
 164                 inst->overwrites_reg(acp[i]->src)) {
 165                BITSET_SET(bd[block->num].kill, i);
 166             }
 167          }
 168       }
 169    }
 170
 171    /* Populate the initial values for the livein and liveout sets.  For the
 172     * block at the start of the program, livein = 0 and liveout = copy.
 173     * For the others, set liveout to 0 (the empty set) and livein to ~0
 174     * (the universal set).
 175     */
 176    foreach_block (block, cfg) {
 177       if (block->parents.is_empty()) {
 178          for (int i = 0; i < bitset_words; i++) {
 179             bd[block->num].livein[i] = 0u;
 180             bd[block->num].liveout[i] = bd[block->num].copy[i];
 181          }
 182       } else {
 183          for (int i = 0; i < bitset_words; i++) {
 184             bd[block->num].liveout[i] = 0u;
 185             bd[block->num].livein[i] = ~0u;
 186          }
 187       }
 188    }
 189 }
 190
 191 /**
 192  * Walk the set of instructions in the block, marking which entries in the acp
 193  * are killed by the block.
 194  */
 195 void
 196 fs_copy_prop_dataflow::run()
 197 {
 198    bool progress;
 199
 200    do {
 201       progress = false;
 202
 203       /* Update liveout for all blocks. */
 204       foreach_block (block, cfg) {
 205          if (block->parents.is_empty())
 206             continue;
 207
 208          for (int i = 0; i < bitset_words; i++) {
 209             const BITSET_WORD old_liveout = bd[block->num].liveout[i];
 210
 211             bd[block->num].liveout[i] =
 212                bd[block->num].copy[i] | (bd[block->num].livein[i] &
 213                                          ~bd[block->num].kill[i]);
 214
 215             if (old_liveout != bd[block->num].liveout[i])
 216                progress = true;
 217          }
 218       }
 219
 220       /* Update livein for all blocks.  If a copy is live out of all parent
 221        * blocks, it's live coming in to this block.
 222        */
 223       foreach_block (block, cfg) {
 224          if (block->parents.is_empty())
 225             continue;
 226
 227          for (int i = 0; i < bitset_words; i++) {
 228             const BITSET_WORD old_livein = bd[block->num].livein[i];
 229
 230             bd[block->num].livein[i] = ~0u;
 231             foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
 232                bblock_t *parent = parent_link->block;
 233                bd[block->num].livein[i] &= bd[parent->num].liveout[i];
 234             }
 235
 236             if (old_livein != bd[block->num].livein[i])
 237                progress = true;
 238          }
 239       }
 240    } while (progress);
 241 }
 242
 243 void
 244 fs_copy_prop_dataflow::dump_block_data() const
 245 {
 246    foreach_block (block, cfg) {
 247       fprintf(stderr, "Block %d [%d, %d] (parents ", block->num,
 248              block->start_ip, block->end_ip);
 249       foreach_list_typed(bblock_link, link, link, &block->parents) {
 250          bblock_t *parent = link->block;
 251          fprintf(stderr, "%d ", parent->num);
 252       }
 253       fprintf(stderr, "):\n");
 254       fprintf(stderr, "       livein = 0x");
 255       for (int i = 0; i < bitset_words; i++)
 256          fprintf(stderr, "%08x", bd[block->num].livein[i]);
 257       fprintf(stderr, ", liveout = 0x");
 258       for (int i = 0; i < bitset_words; i++)
 259          fprintf(stderr, "%08x", bd[block->num].liveout[i]);
 260       fprintf(stderr, ",\n       copy   = 0x");
 261       for (int i = 0; i < bitset_words; i++)
 262          fprintf(stderr, "%08x", bd[block->num].copy[i]);
 263       fprintf(stderr, ", kill    = 0x");
 264       for (int i = 0; i < bitset_words; i++)
 265          fprintf(stderr, "%08x", bd[block->num].kill[i]);
 266       fprintf(stderr, "\n");
 267    }
 268 }
 269
 270 static bool
 271 is_logic_op(enum opcode opcode)
 272 {
 273    return (opcode == BRW_OPCODE_AND ||
 274            opcode == BRW_OPCODE_OR  ||
 275            opcode == BRW_OPCODE_XOR ||
 276            opcode == BRW_OPCODE_NOT);
 277 }
 278
 279 static bool
 280 can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
 281                 const brw_device_info *devinfo)
 282 {
 283    if (stride > 4)
 284       return false;
 285
 286    /* 3-source instructions can only be Align16, which restricts what strides
 287     * they can take. They can only take a stride of 1 (the usual case), or 0
 288     * with a special "repctrl" bit. But the repctrl bit doesn't work for
 289     * 64-bit datatypes, so if the source type is 64-bit then only a stride of
 290     * 1 is allowed. From the Broadwell PRM, Volume 7 "3D Media GPGPU", page
 291     * 944:
 292     *
 293     *    This is applicable to 32b datatypes and 16b datatype. 64b datatypes
 294     *    cannot use the replicate control.
 295     */
 296    if (inst->is_3src()) {
 297       if (type_sz(inst->src[arg].type) > 4)
 298          return stride == 1;
 299       else
 300          return stride == 1 || stride == 0;
 301    }
 302
 303    /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions",
 304     * page 391 ("Extended Math Function"):
 305     *
 306     *     The following restrictions apply for align1 mode: Scalar source is
 307     *     supported. Source and destination horizontal stride must be the
 308     *     same.
 309     *
 310     * From the Haswell PRM Volume 2b "Command Reference - Instructions", page
 311     * 134 ("Extended Math Function"):
 312     *
 313     *    Scalar source is supported. Source and destination horizontal stride
 314     *    must be 1.
 315     *
 316     * and similar language exists for IVB and SNB. Pre-SNB, math instructions
 317     * are sends, so the sources are moved to MRF's and there are no
 318     * restrictions.
 319     */
 320    if (inst->is_math()) {
 321       if (devinfo->gen == 6 || devinfo->gen == 7) {
 322          assert(inst->dst.stride == 1);
 323          return stride == 1 || stride == 0;
 324       } else if (devinfo->gen >= 8) {
 325          return stride == inst->dst.stride || stride == 0;
 326       }
 327    }
 328
 329    return true;
 330 }
 331
 332 bool
 333 fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
 334 {
 335    if (inst->src[arg].file != VGRF)
 336       return false;
 337
 338    if (entry->src.file == IMM)
 339       return false;
 340    assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
 341           entry->src.file == ATTR);
 342
 343    if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
 344        inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
 345       return false;
 346
 347    assert(entry->dst.file == VGRF);
 348    if (inst->src[arg].nr != entry->dst.nr)
 349       return false;
 350
 351    /* Bail if inst is reading a range that isn't contained in the range
 352     * that entry is writing.
 353     */
 354    if (inst->src[arg].reg_offset < entry->dst.reg_offset ||
 355        (inst->src[arg].reg_offset * 32 + inst->src[arg].subreg_offset +
 356         inst->regs_read(arg) * inst->src[arg].stride * 32) >
 357        (entry->dst.reg_offset + entry->regs_written) * 32)
 358       return false;
 359
 360    /* we can't generally copy-propagate UD negations because we
 361     * can end up accessing the resulting values as signed integers
 362     * instead. See also resolve_ud_negate() and comment in
 363     * fs_generator::generate_code.
 364     */
 365    if (entry->src.type == BRW_REGISTER_TYPE_UD &&
 366        entry->src.negate)
 367       return false;
 368
 369    bool has_source_modifiers = entry->src.abs || entry->src.negate;
 370
 371    if ((has_source_modifiers || entry->src.file == UNIFORM ||
 372         !entry->src.is_contiguous()) &&
 373        !inst->can_do_source_mods(devinfo))
 374       return false;
 375
 376    if (has_source_modifiers &&
 377        inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
 378       return false;
 379
 380    /* Bail if the result of composing both strides would exceed the
 381     * hardware limit.
 382     */
 383    if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
 384                         devinfo))
 385       return false;
 386
 387    /* Bail if the instruction type is larger than the execution type of the
 388     * copy, what implies that each channel is reading multiple channels of the
 389     * destination of the copy, and simply replacing the sources would give a
 390     * program with different semantics.
 391     */
 392    if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
 393       return false;
 394
 395    /* Bail if the result of composing both strides cannot be expressed
 396     * as another stride. This avoids, for example, trying to transform
 397     * this:
 398     *
 399     *     MOV (8) rX<1>UD rY<0;1,0>UD
 400     *     FOO (8) ...     rX<8;8,1>UW
 401     *
 402     * into this:
 403     *
 404     *     FOO (8) ...     rY<0;1,0>UW
 405     *
 406     * Which would have different semantics.
 407     */
 408    if (entry->src.stride != 1 &&
 409        (inst->src[arg].stride *
 410         type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
 411       return false;
 412
 413    if (has_source_modifiers &&
 414        entry->dst.type != inst->src[arg].type &&
 415        !inst->can_change_types())
 416       return false;
 417
 418    if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) &&
 419        is_logic_op(inst->opcode)) {
 420       return false;
 421    }
 422
 423    if (entry->saturate) {
 424       switch(inst->opcode) {
 425       case BRW_OPCODE_SEL:
 426          if (inst->src[1].file != IMM ||
 427              inst->src[1].f < 0.0 ||
 428              inst->src[1].f > 1.0) {
 429             return false;
 430          }
 431          break;
 432       default:
 433          return false;
 434       }
 435    }
 436
 437    inst->src[arg].file = entry->src.file;
 438    inst->src[arg].nr = entry->src.nr;
 439    inst->src[arg].stride *= entry->src.stride;
 440    inst->saturate = inst->saturate || entry->saturate;
 441
 442    switch (entry->src.file) {
 443    case UNIFORM:
 444    case BAD_FILE:
 445    case ARF:
 446    case FIXED_GRF:
 447       inst->src[arg].reg_offset = entry->src.reg_offset;
 448       inst->src[arg].subreg_offset = entry->src.subreg_offset;
 449       break;
 450    case ATTR:
 451    case VGRF:
 452       {
 453          /* In this case, we'll just leave the width alone.  The source
 454           * register could have different widths depending on how it is
 455           * being used.  For instance, if only half of the register was
 456           * used then we want to preserve that and continue to only use
 457           * half.
 458           *
 459           * Also, we have to deal with mapping parts of vgrfs to other
 460           * parts of vgrfs so we have to do some reg_offset magic.
 461           */
 462
 463          /* Compute the offset of inst->src[arg] relative to inst->dst */
 464          assert(entry->dst.subreg_offset == 0);
 465          int rel_offset = inst->src[arg].reg_offset - entry->dst.reg_offset;
 466          int rel_suboffset = inst->src[arg].subreg_offset;
 467
 468          /* Compute the final register offset (in bytes) */
 469          int offset = entry->src.reg_offset * 32 + entry->src.subreg_offset;
 470          offset += rel_offset * 32 + rel_suboffset;
 471          inst->src[arg].reg_offset = offset / 32;
 472          inst->src[arg].subreg_offset = offset % 32;
 473       }
 474       break;
 475
 476    case MRF:
 477    case IMM:
 478       unreachable("not reached");
 479    }
 480
 481    if (has_source_modifiers) {
 482       if (entry->dst.type != inst->src[arg].type) {
 483          /* We are propagating source modifiers from a MOV with a different
 484           * type.  If we got here, then we can just change the source and
 485           * destination types of the instruction and keep going.
 486           */
 487          assert(inst->can_change_types());
 488          for (int i = 0; i < inst->sources; i++) {
 489             inst->src[i].type = entry->dst.type;
 490          }
 491          inst->dst.type = entry->dst.type;
 492       }
 493
 494       if (!inst->src[arg].abs) {
 495          inst->src[arg].abs = entry->src.abs;
 496          inst->src[arg].negate ^= entry->src.negate;
 497       }
 498    }
 499
 500    return true;
 501 }
 502
 503
 504 bool
 505 fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
 506 {
 507    bool progress = false;
 508
 509    if (entry->src.file != IMM)
 510       return false;
 511    if (entry->saturate)
 512       return false;
 513
 514    for (int i = inst->sources - 1; i >= 0; i--) {
 515       if (inst->src[i].file != VGRF)
 516          continue;
 517
 518       assert(entry->dst.file == VGRF);
 519       if (inst->src[i].nr != entry->dst.nr)
 520          continue;
 521
 522       /* Bail if inst is reading a range that isn't contained in the range
 523        * that entry is writing.
 524        */
 525       if (inst->src[i].reg_offset < entry->dst.reg_offset ||
 526           (inst->src[i].reg_offset * 32 + inst->src[i].subreg_offset +
 527            inst->regs_read(i) * inst->src[i].stride * 32) >
 528           (entry->dst.reg_offset + entry->regs_written) * 32)
 529          continue;
 530
 531       fs_reg val = entry->src;
 532       val.type = inst->src[i].type;
 533
 534       if (inst->src[i].abs) {
 535          if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
 536              !brw_abs_immediate(val.type, &val.as_brw_reg())) {
 537             continue;
 538          }
 539       }
 540
 541       if (inst->src[i].negate) {
 542          if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
 543              !brw_negate_immediate(val.type, &val.as_brw_reg())) {
 544             continue;
 545          }
 546       }
 547
 548       switch (inst->opcode) {
 549       case BRW_OPCODE_MOV:
 550       case SHADER_OPCODE_LOAD_PAYLOAD:
 551          inst->src[i] = val;
 552          progress = true;
 553          break;
 554
 555       case SHADER_OPCODE_INT_QUOTIENT:
 556       case SHADER_OPCODE_INT_REMAINDER:
 557          /* FINISHME: Promote non-float constants and remove this. */
 558          if (devinfo->gen < 8)
 559             break;
 560          /* fallthrough */
 561       case SHADER_OPCODE_POW:
 562          /* Allow constant propagation into src1 (except on Gen 6), and let
 563           * constant combining promote the constant on Gen < 8.
 564           *
 565           * While Gen 6 MATH can take a scalar source, its source and
 566           * destination offsets must be equal and we cannot ensure that.
 567           */
 568          if (devinfo->gen == 6)
 569             break;
 570          /* fallthrough */
 571       case BRW_OPCODE_BFI1:
 572       case BRW_OPCODE_ASR:
 573       case BRW_OPCODE_SHL:
 574       case BRW_OPCODE_SHR:
 575       case BRW_OPCODE_SUBB:
 576          if (i == 1) {
 577             inst->src[i] = val;
 578             progress = true;
 579          }
 580          break;
 581
 582       case BRW_OPCODE_MACH:
 583       case BRW_OPCODE_MUL:
 584       case SHADER_OPCODE_MULH:
 585       case BRW_OPCODE_ADD:
 586       case BRW_OPCODE_OR:
 587       case BRW_OPCODE_AND:
 588       case BRW_OPCODE_XOR:
 589       case BRW_OPCODE_ADDC:
 590          if (i == 1) {
 591             inst->src[i] = val;
 592             progress = true;
 593          } else if (i == 0 && inst->src[1].file != IMM) {
 594             /* Fit this constant in by commuting the operands.
 595              * Exception: we can't do this for 32-bit integer MUL/MACH
 596              * because it's asymmetric.
 597              *
 598              * The BSpec says for Broadwell that
 599              *
 600              *    "When multiplying DW x DW, the dst cannot be accumulator."
 601              *
 602              * Integer MUL with a non-accumulator destination will be lowered
 603              * by lower_integer_multiplication(), so don't restrict it.
 604              */
 605             if (((inst->opcode == BRW_OPCODE_MUL &&
 606                   inst->dst.is_accumulator()) ||
 607                  inst->opcode == BRW_OPCODE_MACH) &&
 608                 (inst->src[1].type == BRW_REGISTER_TYPE_D ||
 609                  inst->src[1].type == BRW_REGISTER_TYPE_UD))
 610                break;
 611             inst->src[0] = inst->src[1];
 612             inst->src[1] = val;
 613             progress = true;
 614          }
 615          break;
 616
 617       case BRW_OPCODE_CMP:
 618       case BRW_OPCODE_IF:
 619          if (i == 1) {
 620             inst->src[i] = val;
 621             progress = true;
 622          } else if (i == 0 && inst->src[1].file != IMM) {
 623             enum brw_conditional_mod new_cmod;
 624
 625             new_cmod = brw_swap_cmod(inst->conditional_mod);
 626             if (new_cmod != BRW_CONDITIONAL_NONE) {
 627                /* Fit this constant in by swapping the operands and
 628                 * flipping the test
 629                 */
 630                inst->src[0] = inst->src[1];
 631                inst->src[1] = val;
 632                inst->conditional_mod = new_cmod;
 633                progress = true;
 634             }
 635          }
 636          break;
 637
 638       case BRW_OPCODE_SEL:
 639          if (i == 1) {
 640             inst->src[i] = val;
 641             progress = true;
 642          } else if (i == 0 && inst->src[1].file != IMM) {
 643             inst->src[0] = inst->src[1];
 644             inst->src[1] = val;
 645
 646             /* If this was predicated, flipping operands means
 647              * we also need to flip the predicate.
 648              */
 649             if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
 650                inst->predicate_inverse =
 651                   !inst->predicate_inverse;
 652             }
 653             progress = true;
 654          }
 655          break;
 656
 657       case SHADER_OPCODE_UNTYPED_ATOMIC:
 658       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 659       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 660       case SHADER_OPCODE_TYPED_ATOMIC:
 661       case SHADER_OPCODE_TYPED_SURFACE_READ:
 662       case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 663          /* We only propagate into the surface argument of the
 664           * instruction. Everything else goes through LOAD_PAYLOAD.
 665           */
 666          if (i == 1) {
 667             inst->src[i] = val;
 668             progress = true;
 669          }
 670          break;
 671
 672       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 673       case SHADER_OPCODE_BROADCAST:
 674          inst->src[i] = val;
 675          progress = true;
 676          break;
 677
 678       case BRW_OPCODE_MAD:
 679       case BRW_OPCODE_LRP:
 680          inst->src[i] = val;
 681          progress = true;
 682          break;
 683
 684       default:
 685          break;
 686       }
 687    }
 688
 689    return progress;
 690 }
 691
 692 static bool
 693 can_propagate_from(fs_inst *inst)
 694 {
 695    return (inst->opcode == BRW_OPCODE_MOV &&
 696            inst->dst.file == VGRF &&
 697            ((inst->src[0].file == VGRF &&
 698              (inst->src[0].nr != inst->dst.nr ||
 699               inst->src[0].reg_offset != inst->dst.reg_offset)) ||
 700             inst->src[0].file == ATTR ||
 701             inst->src[0].file == UNIFORM ||
 702             inst->src[0].file == IMM) &&
 703            inst->src[0].type == inst->dst.type &&
 704            !inst->is_partial_write());
 705 }
 706
 707 /* Walks a basic block and does copy propagation on it using the acp
 708  * list.
 709  */
 710 bool
 711 fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
 712                                      exec_list *acp)
 713 {
 714    bool progress = false;
 715
 716    foreach_inst_in_block(fs_inst, inst, block) {
 717       /* Try propagating into this instruction. */
 718       for (int i = 0; i < inst->sources; i++) {
 719          if (inst->src[i].file != VGRF)
 720             continue;
 721
 722          foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) {
 723             if (try_constant_propagate(inst, entry))
 724                progress = true;
 725             else if (try_copy_propagate(inst, i, entry))
 726                progress = true;
 727          }
 728       }
 729
 730       /* kill the destination from the ACP */
 731       if (inst->dst.file == VGRF) {
 732          foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
 733             if (inst->overwrites_reg(entry->dst)) {
 734                entry->remove();
 735             }
 736          }
 737
 738          /* Oops, we only have the chaining hash based on the destination, not
 739           * the source, so walk across the entire table.
 740           */
 741          for (int i = 0; i < ACP_HASH_SIZE; i++) {
 742             foreach_in_list_safe(acp_entry, entry, &acp[i]) {
 743                if (inst->overwrites_reg(entry->src))
 744                   entry->remove();
 745             }
 746          }
 747       }
 748
 749       /* If this instruction's source could potentially be folded into the
 750        * operand of another instruction, add it to the ACP.
 751        */
 752       if (can_propagate_from(inst)) {
 753          acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
 754          entry->dst = inst->dst;
 755          entry->src = inst->src[0];
 756          entry->regs_written = inst->regs_written;
 757          entry->opcode = inst->opcode;
 758          entry->saturate = inst->saturate;
 759          acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
 760       } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
 761                  inst->dst.file == VGRF) {
 762          int offset = 0;
 763          for (int i = 0; i < inst->sources; i++) {
 764             int effective_width = i < inst->header_size ? 8 : inst->exec_size;
 765             int regs_written = effective_width / 8;
 766             if (inst->src[i].file == VGRF) {
 767                acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
 768                entry->dst = inst->dst;
 769                entry->dst.reg_offset = offset;
 770                entry->src = inst->src[i];
 771                entry->regs_written = regs_written;
 772                entry->opcode = inst->opcode;
 773                if (!entry->dst.equals(inst->src[i])) {
 774                   acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
 775                } else {
 776                   ralloc_free(entry);
 777                }
 778             }
 779             offset += regs_written;
 780          }
 781       }
 782    }
 783
 784    return progress;
 785 }
 786
 787 bool
 788 fs_visitor::opt_copy_propagate()
 789 {
 790    bool progress = false;
 791    void *copy_prop_ctx = ralloc_context(NULL);
 792    exec_list *out_acp[cfg->num_blocks];
 793
 794    for (int i = 0; i < cfg->num_blocks; i++)
 795       out_acp[i] = new exec_list [ACP_HASH_SIZE];
 796
 797    /* First, walk through each block doing local copy propagation and getting
 798     * the set of copies available at the end of the block.
 799     */
 800    foreach_block (block, cfg) {
 801       progress = opt_copy_propagate_local(copy_prop_ctx, block,
 802                                           out_acp[block->num]) || progress;
 803    }
 804
 805    /* Do dataflow analysis for those available copies. */
 806    fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, out_acp);
 807
 808    /* Next, re-run local copy propagation, this time with the set of copies
 809     * provided by the dataflow analysis available at the start of a block.
 810     */
 811    foreach_block (block, cfg) {
 812       exec_list in_acp[ACP_HASH_SIZE];
 813
 814       for (int i = 0; i < dataflow.num_acp; i++) {
 815          if (BITSET_TEST(dataflow.bd[block->num].livein, i)) {
 816             struct acp_entry *entry = dataflow.acp[i];
 817             in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
 818          }
 819       }
 820
 821       progress = opt_copy_propagate_local(copy_prop_ctx, block, in_acp) || progress;
 822    }
 823
 824    for (int i = 0; i < cfg->num_blocks; i++)
 825       delete [] out_acp[i];
 826    ralloc_free(copy_prop_ctx);
 827
 828    if (progress)
 829       invalidate_live_intervals();
 830
 831    return progress;
 832 }