src/panfrost/bifrost/bi_lower_combine.c

   1 /*
   2  * Copyright (C) 2020 Collabora, Ltd.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "compiler.h"
  25
  26 /* NIR creates vectors as vecN ops, which we represent by a synthetic
  27  * BI_COMBINE instruction, e.g.:
  28  *
  29  *      v = combine x, y, z, w
  30  *
  31  * These combines need to be lowered by the pass in this file. Fix a given
  32  * source at component c.
  33  *
  34  * First suppose the source is SSA. If it is also scalar, then we may rewrite
  35  * the destination of the generating instruction (unique by SSA+scalar) to
  36  * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
  37  * (the original by scalar). If it is vector, there are two cases. If the
  38  * component c is `x`, we are accessing v.x, and each of the succeeding
  39  * components y, z... up to the last component of the vector are accessed
  40  * sequentially, then we may perform the same rewrite. If this is not the case,
  41  * rewriting would require more complex vector features, so we fallback on a
  42  * move.
  43  *
  44  * Otherwise is the source is not SSA, we also fallback on a move. We could
  45  * probably do better.
  46  */
  47
  48 static void
  49 bi_combine_mov32(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
  50 {
  51         bi_instruction move = {
  52                 .type = BI_MOV,
  53                 .dest = R,
  54                 .dest_type = nir_type_uint32,
  55                 .dest_offset = comp,
  56                 .src = { parent->src[comp] },
  57                 .src_types = { nir_type_uint32 },
  58                 .swizzle = { { parent->swizzle[comp][0] } }
  59         };
  60
  61         bi_emit_before(ctx, parent, move);
  62 }
  63
  64 static void
  65 bi_combine_sel16(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
  66 {
  67         bi_instruction sel = {
  68                 .type = BI_SELECT,
  69                 .dest = R,
  70                 .dest_type = nir_type_uint32,
  71                 .dest_offset = comp >> 1,
  72                 .src = { parent->src[comp], parent->src[comp + 1] },
  73                 .src_types = { nir_type_uint16, nir_type_uint16 },
  74                 .swizzle = { {
  75                         parent->swizzle[comp][0],
  76                         parent->swizzle[comp + 1][0],
  77                 } }
  78         };
  79
  80         bi_emit_before(ctx, parent, sel);
  81 }
  82
  83 /* Gets the instruction generating a given source. Combine lowering is
  84  * accidentally O(n^2) right now because this function is O(n) instead of O(1).
  85  * If this pass is slow, this cost can be avoided in favour for better
  86  * bookkeeping. */
  87
  88 static bi_instruction *
  89 bi_get_parent(bi_context *ctx, unsigned idx)
  90 {
  91         bi_foreach_instr_global(ctx, ins) {
  92                 if (ins->dest == idx)
  93                         return ins;
  94         }
  95
  96         return NULL;
  97 }
  98
  99 /* Rewrites uses of an index. Again, this could be O(n) to the program but is
 100  * currently O(nc) to the program and number of combines, so the pass becomes
 101  * effectively O(n^2). Better bookkeeping would bring down to linear if that's
 102  * an issue. */
 103
 104 static void
 105 bi_rewrite_uses(bi_context *ctx,
 106                 unsigned old, unsigned oldc,
 107                 unsigned new, unsigned newc)
 108 {
 109         bi_foreach_instr_global(ctx, ins) {
 110                 bi_foreach_src(ins, s) {
 111                         if (ins->src[s] != old) continue;
 112
 113                         for (unsigned i = 0; i < 16; ++i)
 114                                 ins->swizzle[s][i] += (newc - oldc);
 115
 116                         ins->src[s] = new;
 117                 }
 118         }
 119 }
 120
 121 /* Checks if we have a nicely aligned vector prefix */
 122
 123 static bool
 124 bi_is_aligned_vec32(bi_instruction *combine, unsigned s, bi_instruction *io,
 125                 unsigned *count)
 126 {
 127         /* We only support prefixes */
 128         if (s != 0)
 129                 return false;
 130
 131         if (!(bi_class_props[io->type] & BI_VECTOR))
 132                 return false;
 133
 134         if (nir_alu_type_get_type_size(combine->dest_type) != 32)
 135                 return false;
 136
 137         if (nir_alu_type_get_type_size(io->dest_type) != 32)
 138                 return false;
 139
 140         unsigned components = io->vector_channels;
 141
 142         /* Are we contiguous like that? */
 143
 144         for (unsigned i = 0; i < components; ++i) {
 145                 if (combine->src[i] != io->dest)
 146                         return false;
 147
 148                 if (combine->swizzle[i][0] != i)
 149                         return false;
 150         }
 151
 152         /* We're good to go */
 153         *count = components;
 154         return true;
 155 }
 156
 157 #if 0
 158 /* Tries to lower a given source of a combine to an appropriate rewrite,
 159  * returning true if successful, and false with no changes otherwise. */
 160
 161 static bool
 162 bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
 163                 unsigned *vec_count)
 164 {
 165         unsigned src = ins->src[s];
 166
 167         /* We currently only handle SSA */
 168
 169         if (!src) return false;
 170         if (src & (BIR_SPECIAL | PAN_IS_REG)) return false;
 171
 172         /* We are SSA. Lookup the generating instruction. */
 173         unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;
 174
 175         bi_instruction *parent = bi_get_parent(ctx, src,
 176                          0xF << (ins->swizzle[s][0] * bytes));
 177
 178         if (!parent) return false;
 179
 180         /* We have a parent instuction, sanity check the typesize */
 181         unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
 182         if (pbytes != bytes) return false;
 183
 184         bool scalar = parent->vector_channels != 0;
 185         if (!(scalar || bi_is_aligned_vec(ins, s, parent, vec_count))) return false;
 186
 187         if (!bi_shift_mask(parent, bytes * s)) return false;
 188         bi_rewrite_uses(ctx, parent->dest, 0, R, s);
 189         parent->dest = R;
 190         return true;
 191 }
 192 #endif
 193
 194 void
 195 bi_lower_combine(bi_context *ctx, bi_block *block)
 196 {
 197         bi_foreach_instr_in_block_safe(block, ins) {
 198                 if (ins->type != BI_COMBINE) continue;
 199
 200                 unsigned R = bi_make_temp_reg(ctx);
 201                 unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
 202
 203                 bi_foreach_src(ins, s) {
 204                         /* We're done early for vec2/3 */
 205                         if (!ins->src[s])
 206                                 continue;
 207
 208 #if 0
 209                         unsigned vec_count = 0;
 210
 211                         if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
 212                                 /* Skip vectored sources */
 213                                 if (vec_count)
 214                                         s += (vec_count - 1);
 215                         } else {
 216                                 bi_insert_combine_mov(ctx, ins, s, R);
 217                         }
 218 #endif
 219                         if (sz == 32)
 220                                 bi_combine_mov32(ctx, ins, s, R);
 221                         else if (sz == 16) {
 222                                 bi_combine_sel16(ctx, ins, s, R);
 223                                 s++;
 224                         } else {
 225                                 unreachable("Unknown COMBINE size");
 226                         }
 227                 }
 228
 229
 230                 bi_rewrite_uses(ctx, ins->dest, 0, R, 0);
 231                 bi_remove_instruction(ins);
 232         }
 233 }