/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_instr_set.h"
#include "nir_search_helpers.h"
#include "nir_builder.h"
#include "util/u_vector.h"
/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'.  The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero.  For example,
 *
 *    vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *    ...
 *    vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *
 * becomes
 *
 *    vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *    vec1 32 ssa_112 = flt 0.0, ssa_111
 *    ...
 */
struct block_queue {
   /**
    * Stack of blocks from the current location in the CFG to the entry
    * point of the function.
    *
    * This is sort of a poor man's dominator tree.
    */
   struct exec_list blocks;

   /** List of freed block_instructions structures that can be reused. */
   struct exec_list reusable_blocks;
};
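
/* A minimal sketch of the resulting invariant (assuming the recursive
 * traversal in comparison_pre_block() below): while a block B is being
 * visited, 'blocks' holds one block_instructions entry for each block on
 * the dominator-tree path from the function entry down to B, in order.
 * Walking the list from the head therefore enumerates exactly the
 * dominators of B.
 */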
struct block_instructions {
   struct exec_node node;

   /**
    * Set of comparison instructions from the block that are candidates for
    * being replaced by add instructions.
    */
   struct u_vector instructions;
};
static void
block_queue_init(struct block_queue *bq)
{
   exec_list_make_empty(&bq->blocks);
   exec_list_make_empty(&bq->reusable_blocks);
}
static void
block_queue_finish(struct block_queue *bq)
{
   struct block_instructions *n;

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) {
      u_vector_finish(&n->instructions);
      free(n);
   }

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL)
      free(n);
}
static struct block_instructions *
push_block(struct block_queue *bq)
{
   struct block_instructions *bi =
      (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);

   if (bi == NULL) {
      bi = calloc(1, sizeof(struct block_instructions));
      if (bi == NULL)
         return NULL;
   }

   if (!u_vector_init(&bi->instructions,
                      sizeof(nir_alu_instr *),
                      8 * sizeof(nir_alu_instr *))) {
      free(bi);
      return NULL;
   }

   exec_list_push_tail(&bq->blocks, &bi->node);

   return bi;
}
static void
pop_block(struct block_queue *bq, struct block_instructions *bi)
{
   u_vector_finish(&bi->instructions);
   exec_node_remove(&bi->node);
   exec_list_push_head(&bq->reusable_blocks, &bi->node);
}
static void
add_instruction_for_block(struct block_instructions *bi,
                          nir_alu_instr *alu)
{
   nir_alu_instr **data = u_vector_add(&bi->instructions);

   *data = alu;
}
static void
rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
                            nir_alu_instr *orig_add, bool zero_on_left)
{
   void *const mem_ctx = ralloc_parent(orig_cmp);

   bld->cursor = nir_before_instr(&orig_cmp->instr);

   /* This is somewhat tricky.  The compare instruction may be something like
    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
    * b).  This is problematic because the SSA value for the fneg(a) may not
    * exist yet at the compare instruction.
    *
    * We fabricate the operands of the new add.  This is done using
    * information provided by zero_on_left.  If zero_on_left is true, we know
    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)).  If the
    * original compare instruction was (fcmp, a, b), x = b and y = -a.  If
    * zero_on_left is false, the resulting compare instruction is (fcmp,
    * (fadd, x, y), 0.0) and x = a and y = -b.
    */
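
   /* A worked example with hypothetical operands: if orig_cmp is
    * (flt, a, b) and zero_on_left is false, the code below builds
    *
    *    fadd = (fadd, a, fneg(b))
    *    cmp  = (flt, fadd, 0.0)
    *
    * i.e. 'a < b' is re-expressed as 'a - b < 0', matching the example in
    * the header comment.
    */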
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);

   nir_ssa_def *const fadd = zero_on_left
      ? nir_fadd(bld, b, nir_fneg(bld, a))
      : nir_fadd(bld, a, nir_fneg(bld, b));

   nir_ssa_def *const zero =
      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);

   nir_ssa_def *const cmp = zero_on_left
      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);
   /* Generating extra moves of the results is the easy way to make sure the
    * writemasks match the original instructions.  Later optimization passes
    * will clean these up.  This is similar to nir_replace_instr (in
    * nir_search.c).
    */
   nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_mov);
   mov_add->dest.write_mask = orig_add->dest.write_mask;
   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
                     orig_add->dest.dest.ssa.num_components,
                     orig_add->dest.dest.ssa.bit_size, NULL);
   mov_add->src[0].src = nir_src_for_ssa(fadd);

   nir_builder_instr_insert(bld, &mov_add->instr);

   nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_mov);
   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
                     orig_cmp->dest.dest.ssa.num_components,
                     orig_cmp->dest.dest.ssa.bit_size, NULL);
   mov_cmp->src[0].src = nir_src_for_ssa(cmp);

   nir_builder_instr_insert(bld, &mov_cmp->instr);

   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
                            nir_src_for_ssa(&mov_cmp->dest.dest.ssa));
   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
                            nir_src_for_ssa(&mov_add->dest.dest.ssa));

   /* We know these have no more uses because we just rewrote them all, so
    * we can remove them.
    */
   nir_instr_remove(&orig_cmp->instr);
   nir_instr_remove(&orig_add->instr);
}
static bool
comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
{
   bool progress = false;

   struct block_instructions *bi = push_block(bq);
   if (bi == NULL)
      return false;

   /* Starting with the current block, examine each instruction.  If the
    * instruction is a comparison that matches the '±a cmp ±b' pattern, add
    * it to the block_instructions::instructions set.  If the instruction is
    * an add instruction, walk up the block queue looking at the stored
    * instructions.  If a matching comparison is found, move the addition and
    * replace the comparison with a different comparison based on the result
    * of the addition.  All of the blocks in the queue are guaranteed to be
    * dominators of the current block.
    *
    * After processing the current block, recurse into the blocks dominated
    * by the current block.
    */
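
   /* Tracing the header example through this loop (hypothetical SSA names):
    * when the block containing 'flt 0.37, ssa_110.w' is visited, the flt is
    * stored in this block's u_vector.  When a dominated block containing
    * 'fadd ssa_110.w, -0.37' is visited later, that u_vector is still on
    * the queue, so the pair is matched and rewritten.
    */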
   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *const alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.ssa.num_components != 1)
         continue;

      if (alu->dest.saturate)
         continue;

      static const uint8_t swizzle[4] = { 0, 0, 0, 0 };
      switch (alu->op) {
      case nir_op_fadd: {
         /* If the instruction is fadd, check it against comparison
          * instructions that dominate it.
          */
         struct block_instructions *b =
            (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);

         while (b->node.next != NULL) {
            nir_alu_instr **a;
            bool rewrote_compare = false;

            u_vector_foreach(a, &b->instructions) {
               nir_alu_instr *const cmp = *a;

               if (cmp == NULL)
                  continue;
               /* The operands of both instructions are, with some liberty,
                * commutative.  Check all four permutations.  The third and
                * fourth permutations are negations of the first two.
                */
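               /* Spelled out with hypothetical operands, for
                * cmp = (fcmp, A, B):
                *
                *    (fadd, A, -B) or (fadd, -B, A)  =>  (A + -B) cmp 0
                *    (fadd, B, -A) or (fadd, -A, B)  =>  0 cmp (B + -A)
                */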
               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
                  /* These are the cases where (A cmp B) matches either (A +
                   * -B) or (-B + A):
                   *
                   *    A cmp B <=> A + -B cmp 0
                   */
                  rewrite_compare_instruction(bld, cmp, alu, false);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
                  /* This is the case where (A cmp B) matches (B + -A) or (-A
                   * + B):
                   *
                   *    A cmp B <=> 0 cmp B + -A
                   */
                  rewrite_compare_instruction(bld, cmp, alu, true);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               }
            }
            /* Bail after a compare in the most dominating block is found.
             * This is necessary because 'alu' has been removed from the
             * instruction stream.  Should there be a matching compare in
             * another block, calling rewrite_compare_instruction again will
             * try to operate on a node that is not in the list as if it were
             * in the list.
             *
             * FINISHME: There may be opportunity for additional optimization
             * here.  I discovered this problem due to a shader in Guacamelee.
             * It may be possible to rewrite the matching compares that are
             * encountered later to reuse the result from the compare that was
             * first rewritten.  It's also possible that this is just taken
             * care of by calling the optimization pass repeatedly.
             */
            if (rewrote_compare) {
               progress = true;
               break;
            }

            b = (struct block_instructions *) b->node.next;
         }

         break;
      }
      case nir_op_flt:
      case nir_op_fge:
      case nir_op_feq:
      case nir_op_fne:
         /* If the instruction is a comparison that is used by an
          * if-statement and neither operand is the immediate value zero
          * (such a compare is already against zero, so the rewrite would
          * gain nothing), add it to the set.
          */
         if (is_used_by_if(alu) &&
             is_not_const_zero(NULL, alu, 0, 1, swizzle) &&
             is_not_const_zero(NULL, alu, 1, 1, swizzle))
            add_instruction_for_block(bi, alu);

         break;

      default:
         break;
      }
   }
   for (unsigned i = 0; i < block->num_dom_children; i++) {
      nir_block *child = block->dom_children[i];

      if (comparison_pre_block(child, bq, bld))
         progress = true;
   }

   pop_block(bq, bi);

   return progress;
}
bool
nir_opt_comparison_pre_impl(nir_function_impl *impl)
{
   struct block_queue bq;
   nir_builder bld;

   block_queue_init(&bq);
   nir_builder_init(&bld, impl);

   nir_metadata_require(impl, nir_metadata_dominance);

   const bool progress =
      comparison_pre_block(nir_start_block(impl), &bq, &bld);

   block_queue_finish(&bq);

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}
bool
nir_opt_comparison_pre(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_opt_comparison_pre_impl(function->impl);
   }

   return progress;
}