src/intel/compiler/brw_vec4_cse.cpp

   1 /*
   2  * Copyright © 2012, 2013, 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "brw_vec4.h"
  25 #include "brw_vec4_live_variables.h"
  26 #include "brw_cfg.h"
  27
  28 using namespace brw;
  29
  30 /** @file brw_vec4_cse.cpp
  31  *
  32  * Support for local common subexpression elimination.
  33  *
  34  * See Muchnick's Advanced Compiler Design and Implementation, section
  35  * 13.1 (p378).
  36  */
  37
  38 namespace {
  39 struct aeb_entry : public exec_node {
  40    /** The instruction that generates the expression value. */
  41    vec4_instruction *generator;
  42
  43    /** The temporary where the value is stored. */
  44    src_reg tmp;
  45 };
  46 }
  47
  48 static bool
  49 is_expression(const vec4_instruction *const inst)
  50 {
  51    switch (inst->opcode) {
  52    case BRW_OPCODE_MOV:
  53    case BRW_OPCODE_SEL:
  54    case BRW_OPCODE_NOT:
  55    case BRW_OPCODE_AND:
  56    case BRW_OPCODE_OR:
  57    case BRW_OPCODE_XOR:
  58    case BRW_OPCODE_SHR:
  59    case BRW_OPCODE_SHL:
  60    case BRW_OPCODE_ASR:
  61    case BRW_OPCODE_CMP:
  62    case BRW_OPCODE_CMPN:
  63    case BRW_OPCODE_ADD:
  64    case BRW_OPCODE_MUL:
  65    case SHADER_OPCODE_MULH:
  66    case BRW_OPCODE_FRC:
  67    case BRW_OPCODE_RNDU:
  68    case BRW_OPCODE_RNDD:
  69    case BRW_OPCODE_RNDE:
  70    case BRW_OPCODE_RNDZ:
  71    case BRW_OPCODE_LINE:
  72    case BRW_OPCODE_PLN:
  73    case BRW_OPCODE_MAD:
  74    case BRW_OPCODE_LRP:
  75    case VEC4_OPCODE_UNPACK_UNIFORM:
  76    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
  77    case SHADER_OPCODE_BROADCAST:
  78    case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
  79    case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
  80       return true;
  81    case SHADER_OPCODE_RCP:
  82    case SHADER_OPCODE_RSQ:
  83    case SHADER_OPCODE_SQRT:
  84    case SHADER_OPCODE_EXP2:
  85    case SHADER_OPCODE_LOG2:
  86    case SHADER_OPCODE_POW:
  87    case SHADER_OPCODE_INT_QUOTIENT:
  88    case SHADER_OPCODE_INT_REMAINDER:
  89    case SHADER_OPCODE_SIN:
  90    case SHADER_OPCODE_COS:
  91       return inst->mlen == 0;
  92    default:
  93       return false;
  94    }
  95 }
  96
  97 static bool
  98 operands_match(const vec4_instruction *a, const vec4_instruction *b)
  99 {
 100    const src_reg *xs = a->src;
 101    const src_reg *ys = b->src;
 102
 103    if (a->opcode == BRW_OPCODE_MAD) {
 104       return xs[0].equals(ys[0]) &&
 105              ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
 106               (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
 107    } else if (a->opcode == BRW_OPCODE_MOV &&
 108               xs[0].file == IMM &&
 109               xs[0].type == BRW_REGISTER_TYPE_VF) {
 110       src_reg tmp_x = xs[0];
 111       src_reg tmp_y = ys[0];
 112
 113       /* Smash out the values that are not part of the writemask.  Otherwise
 114        * the equals operator will fail due to mismatches in unused components.
 115        */
 116       const unsigned ab_writemask = a->dst.writemask & b->dst.writemask;
 117       const uint32_t mask = ((ab_writemask & WRITEMASK_X) ? 0x000000ff : 0) |
 118                             ((ab_writemask & WRITEMASK_Y) ? 0x0000ff00 : 0) |
 119                             ((ab_writemask & WRITEMASK_Z) ? 0x00ff0000 : 0) |
 120                             ((ab_writemask & WRITEMASK_W) ? 0xff000000 : 0);
 121
 122       tmp_x.ud &= mask;
 123       tmp_y.ud &= mask;
 124
 125       return tmp_x.equals(tmp_y);
 126    } else if (!a->is_commutative()) {
 127       return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]);
 128    } else {
 129       return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
 130              (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
 131    }
 132 }
 133
 134 /**
 135  * Checks if instructions match, exactly for sources, but loosely for
 136  * destination writemasks.
 137  *
 138  * \param 'a' is the generating expression from the AEB entry.
 139  * \param 'b' is the second occurrence of the expression that we're
 140  *        considering eliminating.
 141  */
 142 static bool
 143 instructions_match(vec4_instruction *a, vec4_instruction *b)
 144 {
 145    return a->opcode == b->opcode &&
 146           a->saturate == b->saturate &&
 147           a->predicate == b->predicate &&
 148           a->predicate_inverse == b->predicate_inverse &&
 149           a->conditional_mod == b->conditional_mod &&
 150           a->flag_subreg == b->flag_subreg &&
 151           a->dst.type == b->dst.type &&
 152           a->offset == b->offset &&
 153           a->mlen == b->mlen &&
 154           a->base_mrf == b->base_mrf &&
 155           a->header_size == b->header_size &&
 156           a->shadow_compare == b->shadow_compare &&
 157           ((a->dst.writemask & b->dst.writemask) == a->dst.writemask) &&
 158           a->force_writemask_all == b->force_writemask_all &&
 159           a->size_written == b->size_written &&
 160           a->exec_size == b->exec_size &&
 161           a->group == b->group &&
 162           operands_match(a, b);
 163 }
 164
 165 bool
 166 vec4_visitor::opt_cse_local(bblock_t *block)
 167 {
 168    bool progress = false;
 169    exec_list aeb;
 170
 171    void *cse_ctx = ralloc_context(NULL);
 172
 173    int ip = block->start_ip;
 174    foreach_inst_in_block (vec4_instruction, inst, block) {
 175       /* Skip some cases. */
 176       if (is_expression(inst) && !inst->predicate && inst->mlen == 0 &&
 177           ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
 178            inst->dst.is_null()))
 179       {
 180          bool found = false;
 181
 182          foreach_in_list_use_after(aeb_entry, entry, &aeb) {
 183             /* Match current instruction's expression against those in AEB. */
 184             if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
 185                 instructions_match(inst, entry->generator)) {
 186                found = true;
 187                progress = true;
 188                break;
 189             }
 190          }
 191
 192          if (!found) {
 193             if (inst->opcode != BRW_OPCODE_MOV ||
 194                 (inst->opcode == BRW_OPCODE_MOV &&
 195                  inst->src[0].file == IMM &&
 196                  inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
 197                /* Our first sighting of this expression.  Create an entry. */
 198                aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
 199                entry->tmp = src_reg(); /* file will be BAD_FILE */
 200                entry->generator = inst;
 201                aeb.push_tail(entry);
 202             }
 203          } else {
 204             /* This is at least our second sighting of this expression.
 205              * If we don't have a temporary already, make one.
 206              */
 207             bool no_existing_temp = entry->tmp.file == BAD_FILE;
 208             if (no_existing_temp && !entry->generator->dst.is_null()) {
 209                entry->tmp = retype(src_reg(VGRF, alloc.allocate(
 210                                               regs_written(entry->generator)),
 211                                            NULL), inst->dst.type);
 212
 213                const unsigned width = entry->generator->exec_size;
 214                unsigned component_size = width * type_sz(entry->tmp.type);
 215                unsigned num_copy_movs =
 216                   DIV_ROUND_UP(entry->generator->size_written, component_size);
 217                for (unsigned i = 0; i < num_copy_movs; ++i) {
 218                   vec4_instruction *copy =
 219                      MOV(offset(entry->generator->dst, width, i),
 220                          offset(entry->tmp, width, i));
 221                   copy->exec_size = width;
 222                   copy->group = entry->generator->group;
 223                   copy->force_writemask_all =
 224                      entry->generator->force_writemask_all;
 225                   entry->generator->insert_after(block, copy);
 226                }
 227
 228                entry->generator->dst = dst_reg(entry->tmp);
 229             }
 230
 231             /* dest <- temp */
 232             if (!inst->dst.is_null()) {
 233                assert(inst->dst.type == entry->tmp.type);
 234                const unsigned width = inst->exec_size;
 235                unsigned component_size = width * type_sz(inst->dst.type);
 236                unsigned num_copy_movs =
 237                   DIV_ROUND_UP(inst->size_written, component_size);
 238                for (unsigned i = 0; i < num_copy_movs; ++i) {
 239                   vec4_instruction *copy =
 240                      MOV(offset(inst->dst, width, i),
 241                          offset(entry->tmp, width, i));
 242                   copy->exec_size = inst->exec_size;
 243                   copy->group = inst->group;
 244                   copy->force_writemask_all = inst->force_writemask_all;
 245                   inst->insert_before(block, copy);
 246                }
 247             }
 248
 249             /* Set our iterator so that next time through the loop inst->next
 250              * will get the instruction in the basic block after the one we've
 251              * removed.
 252              */
 253             vec4_instruction *prev = (vec4_instruction *)inst->prev;
 254
 255             inst->remove(block);
 256             inst = prev;
 257          }
 258       }
 259
 260       foreach_in_list_safe(aeb_entry, entry, &aeb) {
 261          /* Kill all AEB entries that write a different value to or read from
 262           * the flag register if we just wrote it.
 263           */
 264          if (inst->writes_flag()) {
 265             if (entry->generator->reads_flag() ||
 266                 (entry->generator->writes_flag() &&
 267                  !instructions_match(inst, entry->generator))) {
 268                entry->remove();
 269                ralloc_free(entry);
 270                continue;
 271             }
 272          }
 273
 274          for (int i = 0; i < 3; i++) {
 275             src_reg *src = &entry->generator->src[i];
 276
 277             /* Kill all AEB entries that use the destination we just
 278              * overwrote.
 279              */
 280             if (inst->dst.file == entry->generator->src[i].file &&
 281                 inst->dst.nr == entry->generator->src[i].nr) {
 282                entry->remove();
 283                ralloc_free(entry);
 284                break;
 285             }
 286
 287             /* Kill any AEB entries using registers that don't get reused any
 288              * more -- a sure sign they'll fail operands_match().
 289              */
 290             if (src->file == VGRF) {
 291                if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) {
 292                   entry->remove();
 293                   ralloc_free(entry);
 294                   break;
 295                }
 296             }
 297          }
 298       }
 299
 300       ip++;
 301    }
 302
 303    ralloc_free(cse_ctx);
 304
 305    return progress;
 306 }
 307
 308 bool
 309 vec4_visitor::opt_cse()
 310 {
 311    bool progress = false;
 312
 313    calculate_live_intervals();
 314
 315    foreach_block (block, cfg) {
 316       progress = opt_cse_local(block) || progress;
 317    }
 318
 319    if (progress)
 320       invalidate_analysis(DEPENDENCY_EVERYTHING);
 321
 322    return progress;
 323 }