/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24 #include "nir_instr_set.h"
25 #include "nir_search_helpers.h"
26 #include "nir_builder.h"
27 #include "util/u_vector.h"
/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'.  The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero.  For example,
 *
 *       vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *       ...
 *       vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *       ...
 *
 * becomes
 *
 *       vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *       vec1 32 ssa_112 = flt 0.0, ssa_111
 *       ...
 */
52 * Stack of blocks from the current location in the CFG to the entry point
55 * This is sort of a poor man's dominator tree.
57 struct exec_list blocks
;
59 /** List of freed block_instructions structures that can be reused. */
60 struct exec_list reusable_blocks
;
63 struct block_instructions
{
64 struct exec_node node
;
67 * Set of comparison instructions from the block that are candidates for
68 * being replaced by add instructions.
70 struct u_vector instructions
;
74 block_queue_init(struct block_queue
*bq
)
76 exec_list_make_empty(&bq
->blocks
);
77 exec_list_make_empty(&bq
->reusable_blocks
);
81 block_queue_finish(struct block_queue
*bq
)
83 struct block_instructions
*n
;
85 while ((n
= (struct block_instructions
*) exec_list_pop_head(&bq
->blocks
)) != NULL
) {
86 u_vector_finish(&n
->instructions
);
90 while ((n
= (struct block_instructions
*) exec_list_pop_head(&bq
->reusable_blocks
)) != NULL
) {
95 static struct block_instructions
*
96 push_block(struct block_queue
*bq
)
98 struct block_instructions
*bi
=
99 (struct block_instructions
*) exec_list_pop_head(&bq
->reusable_blocks
);
102 bi
= calloc(1, sizeof(struct block_instructions
));
108 if (!u_vector_init(&bi
->instructions
,
109 sizeof(struct nir_alu_instr
*),
110 8 * sizeof(struct nir_alu_instr
*)))
113 exec_list_push_tail(&bq
->blocks
, &bi
->node
);
119 pop_block(struct block_queue
*bq
, struct block_instructions
*bi
)
121 u_vector_finish(&bi
->instructions
);
122 exec_node_remove(&bi
->node
);
123 exec_list_push_head(&bq
->reusable_blocks
, &bi
->node
);
127 add_instruction_for_block(struct block_instructions
*bi
,
128 struct nir_alu_instr
*alu
)
130 struct nir_alu_instr
**data
=
131 u_vector_add(&bi
->instructions
);
137 rewrite_compare_instruction(nir_builder
*bld
, nir_alu_instr
*orig_cmp
,
138 nir_alu_instr
*orig_add
, bool zero_on_left
)
140 void *const mem_ctx
= ralloc_parent(orig_cmp
);
142 bld
->cursor
= nir_before_instr(&orig_cmp
->instr
);
144 /* This is somewhat tricky. The compare instruction may be something like
145 * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
146 * b). This is problematic because the SSA value for the fneg(a) may not
147 * exist yet at the compare instruction.
149 * We fabricate the operands of the new add. This is done using
150 * information provided by zero_on_left. If zero_on_left is true, we know
151 * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)). If the
152 * original compare instruction was (fcmp, a, b), x = b and y = -a. If
153 * zero_on_left is false, the resulting compare instruction is (fcmp,
154 * (fadd, x, y), 0.0) and x = a and y = -b.
156 nir_ssa_def
*const a
= nir_ssa_for_alu_src(bld
, orig_cmp
, 0);
157 nir_ssa_def
*const b
= nir_ssa_for_alu_src(bld
, orig_cmp
, 1);
159 nir_ssa_def
*const fadd
= zero_on_left
160 ? nir_fadd(bld
, b
, nir_fneg(bld
, a
))
161 : nir_fadd(bld
, a
, nir_fneg(bld
, b
));
163 nir_ssa_def
*const zero
=
164 nir_imm_floatN_t(bld
, 0.0, orig_add
->dest
.dest
.ssa
.bit_size
);
166 nir_ssa_def
*const cmp
= zero_on_left
167 ? nir_build_alu(bld
, orig_cmp
->op
, zero
, fadd
, NULL
, NULL
)
168 : nir_build_alu(bld
, orig_cmp
->op
, fadd
, zero
, NULL
, NULL
);
170 /* Generating extra moves of the results is the easy way to make sure the
171 * writemasks match the original instructions. Later optimization passes
172 * will clean these up. This is similar to nir_replace_instr (in
175 nir_alu_instr
*mov_add
= nir_alu_instr_create(mem_ctx
, nir_op_imov
);
176 mov_add
->dest
.write_mask
= orig_add
->dest
.write_mask
;
177 nir_ssa_dest_init(&mov_add
->instr
, &mov_add
->dest
.dest
,
178 orig_add
->dest
.dest
.ssa
.num_components
,
179 orig_add
->dest
.dest
.ssa
.bit_size
, NULL
);
180 mov_add
->src
[0].src
= nir_src_for_ssa(fadd
);
182 nir_builder_instr_insert(bld
, &mov_add
->instr
);
184 nir_alu_instr
*mov_cmp
= nir_alu_instr_create(mem_ctx
, nir_op_imov
);
185 mov_cmp
->dest
.write_mask
= orig_cmp
->dest
.write_mask
;
186 nir_ssa_dest_init(&mov_cmp
->instr
, &mov_cmp
->dest
.dest
,
187 orig_cmp
->dest
.dest
.ssa
.num_components
,
188 orig_cmp
->dest
.dest
.ssa
.bit_size
, NULL
);
189 mov_cmp
->src
[0].src
= nir_src_for_ssa(cmp
);
191 nir_builder_instr_insert(bld
, &mov_cmp
->instr
);
193 nir_ssa_def_rewrite_uses(&orig_cmp
->dest
.dest
.ssa
,
194 nir_src_for_ssa(&mov_cmp
->dest
.dest
.ssa
));
195 nir_ssa_def_rewrite_uses(&orig_add
->dest
.dest
.ssa
,
196 nir_src_for_ssa(&mov_add
->dest
.dest
.ssa
));
198 /* We know these have no more uses because we just rewrote them all, so we
201 nir_instr_remove(&orig_cmp
->instr
);
202 nir_instr_remove(&orig_add
->instr
);
206 comparison_pre_block(nir_block
*block
, struct block_queue
*bq
, nir_builder
*bld
)
208 bool progress
= false;
210 struct block_instructions
*bi
= push_block(bq
);
214 /* Starting with the current block, examine each instruction. If the
215 * instruction is a comparison that matches the '±a cmp ±b' pattern, add it
216 * to the block_instructions::instructions set. If the instruction is an
217 * add instruction, walk up the block queue looking at the stored
218 * instructions. If a matching comparison is found, move the addition and
219 * replace the comparison with a different comparison based on the result
220 * of the addition. All of the blocks in the queue are guaranteed to be
221 * dominators of the current block.
223 * After processing the current block, recurse into the blocks dominated by
226 nir_foreach_instr_safe(instr
, block
) {
227 if (instr
->type
!= nir_instr_type_alu
)
230 struct nir_alu_instr
*const alu
= nir_instr_as_alu(instr
);
232 if (alu
->dest
.dest
.ssa
.num_components
!= 1)
235 if (alu
->dest
.saturate
)
238 static const uint8_t swizzle
[4] = { 0, 0, 0, 0 };
242 /* If the instruction is fadd, check it against comparison
243 * instructions that dominate it.
245 struct block_instructions
*b
=
246 (struct block_instructions
*) exec_list_get_head_raw(&bq
->blocks
);
248 while (b
->node
.next
!= NULL
) {
250 bool rewrote_compare
= false;
252 u_vector_foreach(a
, &b
->instructions
) {
253 nir_alu_instr
*const cmp
= *a
;
258 /* The operands of both instructions are, with some liberty,
259 * commutative. Check all four permutations. The third and
260 * fourth permutations are negations of the first two.
262 if ((nir_alu_srcs_equal(cmp
, alu
, 0, 0) &&
263 nir_alu_srcs_negative_equal(cmp
, alu
, 1, 1)) ||
264 (nir_alu_srcs_equal(cmp
, alu
, 0, 1) &&
265 nir_alu_srcs_negative_equal(cmp
, alu
, 1, 0))) {
266 /* These are the cases where (A cmp B) matches either (A +
269 * A cmp B <=> A + -B cmp 0
271 rewrite_compare_instruction(bld
, cmp
, alu
, false);
274 rewrote_compare
= true;
276 } else if ((nir_alu_srcs_equal(cmp
, alu
, 1, 0) &&
277 nir_alu_srcs_negative_equal(cmp
, alu
, 0, 1)) ||
278 (nir_alu_srcs_equal(cmp
, alu
, 1, 1) &&
279 nir_alu_srcs_negative_equal(cmp
, alu
, 0, 0))) {
280 /* This is the case where (A cmp B) matches (B + -A) or (-A
283 * A cmp B <=> 0 cmp B + -A
285 rewrite_compare_instruction(bld
, cmp
, alu
, true);
288 rewrote_compare
= true;
293 /* Bail after a compare in the most dominating block is found.
294 * This is necessary because 'alu' has been removed from the
295 * instruction stream. Should there be a matching compare in
296 * another block, calling rewrite_compare_instruction again will
297 * try to operate on a node that is not in the list as if it were
300 * FINISHME: There may be opportunity for additional optimization
301 * here. I discovered this problem due to a shader in Guacamelee.
302 * It may be possible to rewrite the matching compares that are
303 * encountered later to reuse the result from the compare that was
304 * first rewritten. It's also possible that this is just taken
305 * care of by calling the optimization pass repeatedly.
307 if (rewrote_compare
) {
312 b
= (struct block_instructions
*) b
->node
.next
;
322 /* If the instruction is a comparison that is used by an if-statement
323 * and neither operand is immediate value 0, add it to the set.
325 if (is_used_by_if(alu
) &&
326 is_not_const_zero(alu
, 0, 1, swizzle
) &&
327 is_not_const_zero(alu
, 1, 1, swizzle
))
328 add_instruction_for_block(bi
, alu
);
337 for (unsigned i
= 0; i
< block
->num_dom_children
; i
++) {
338 nir_block
*child
= block
->dom_children
[i
];
340 if (comparison_pre_block(child
, bq
, bld
))
350 nir_opt_comparison_pre_impl(nir_function_impl
*impl
)
352 struct block_queue bq
;
355 block_queue_init(&bq
);
356 nir_builder_init(&bld
, impl
);
358 nir_metadata_require(impl
, nir_metadata_dominance
);
360 const bool progress
=
361 comparison_pre_block(nir_start_block(impl
), &bq
, &bld
);
363 block_queue_finish(&bq
);
366 nir_metadata_preserve(impl
, nir_metadata_block_index
|
367 nir_metadata_dominance
);
373 nir_opt_comparison_pre(nir_shader
*shader
)
375 bool progress
= false;
377 nir_foreach_function(function
, shader
) {
379 progress
|= nir_opt_comparison_pre_impl(function
->impl
);