/* pan/bi: Stub out tex_compact logic
 * [mesa.git] src/panfrost/bifrost/bi_lower_combine.c */
1 /*
2 * Copyright (C) 2020 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "compiler.h"
25
26 /* NIR creates vectors as vecN ops, which we represent by a synthetic
27 * BI_COMBINE instruction, e.g.:
28 *
29 * v = combine x, y, z, w
30 *
31 * These combines need to be lowered by the pass in this file. Fix a given
32 * source at component c.
33 *
34 * First suppose the source is SSA. If it is also scalar, then we may rewrite
35 * the destination of the generating instruction (unique by SSA+scalar) to
36 * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
37 * (the original by scalar). If it is vector, there are two cases. If the
38 * component c is `x`, we are accessing v.x, and each of the succeeding
39 * components y, z... up to the last component of the vector are accessed
40 * sequentially, then we may perform the same rewrite. If this is not the case,
41 * rewriting would require a swizzle or writemask (TODO), so we fallback on a
42 * move.
43 *
44 * Otherwise, if the source is not SSA, we also fall back on a move. We could
45 * probably do better.
46 */
47
48 static void
49 bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
50 {
51 unsigned bits = nir_alu_type_get_type_size(parent->dest_type);
52 unsigned bytes = bits / 8;
53
54 bi_instruction move = {
55 .type = BI_MOV,
56 .dest = R,
57 .dest_type = parent->dest_type,
58 .writemask = ((1 << bytes) - 1) << (bytes * comp),
59 .src = { parent->src[comp] },
60 .src_types = { parent->dest_type },
61 .swizzle = { { parent->swizzle[comp][0] } }
62 };
63
64 bi_emit_before(ctx, parent, move);
65 }
66
67 /* Gets the instruction generating a given source. Combine lowering is
68 * accidentally O(n^2) right now because this function is O(n) instead of O(1).
69 * If this pass is slow, this cost can be avoided in favour for better
70 * bookkeeping. */
71
72 static bi_instruction *
73 bi_get_parent(bi_context *ctx, unsigned idx, unsigned mask)
74 {
75 bi_foreach_instr_global(ctx, ins) {
76 if (ins->dest == idx)
77 if ((ins->writemask & mask) == mask)
78 return ins;
79 }
80
81 return NULL;
82 }
83
84 /* Rewrites uses of an index. Again, this could be O(n) to the program but is
85 * currently O(nc) to the program and number of combines, so the pass becomes
86 * effectively O(n^2). Better bookkeeping would bring down to linear if that's
87 * an issue. */
88
89 static void
90 bi_rewrite_uses(bi_context *ctx,
91 unsigned old, unsigned oldc,
92 unsigned new, unsigned newc)
93 {
94 bi_foreach_instr_global(ctx, ins) {
95 bi_foreach_src(ins, s) {
96 if (ins->src[s] != old) continue;
97
98 for (unsigned i = 0; i < 16; ++i)
99 ins->swizzle[s][i] += (newc - oldc);
100
101 ins->src[s] = new;
102 }
103 }
104 }
105
/* Shifts the writemask of an instruction by a specified byte count,
 * rotating the sources to compensate. Returns true if successful, and
 * returns false if not (nondestructive in this case). */

static bool
bi_shift_mask(bi_instruction *ins, unsigned shift)
{
        /* No op and handles the funny cases */
        if (!shift)
                return true;

        unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
        unsigned bytes = sz / 8;

        /* If things are misaligned, we bail. Check if shift % bytes is
         * nonzero. Note bytes is a power-of-two. */
        if (shift & (bytes - 1))
                return false;

        /* Ensure there are no funny types: every live source must be the
         * same size as the destination, or the per-source component shift
         * computed below would be wrong for that source */
        bi_foreach_src(ins, s) {
                if (ins->src[s] && nir_alu_type_get_type_size(ins->src_types[s]) != sz)
                        return false;
        }

        /* Shift swizzle so old i'th component is accessed by new (i + j)'th
         * component where j is component shift */
        unsigned component_shift = shift / bytes;

        /* Sanity check to avoid memory corruption */
        if (component_shift >= sizeof(ins->swizzle[0]))
                return false;

        /* Otherwise, shift is divisible by bytes, and all relevant src types
         * are the same size as the dest type. The writemask has one bit per
         * byte, so a shift of `shift` bytes is a shift of `shift` bits. */
        ins->writemask <<= shift;

        /* Rotate each source's swizzle up by component_shift entries. The
         * first component_shift entries are left stale, but the shifted
         * writemask no longer selects them. memmove is required since the
         * ranges overlap. */
        bi_foreach_src(ins, s) {
                if (!ins->src[s]) continue;

                size_t overlap = sizeof(ins->swizzle[s]) - component_shift;
                memmove(ins->swizzle[s] + component_shift, ins->swizzle[s], overlap);
        }

        return true;
}
152
153 /* Checks if we have a nicely aligned vector prefix */
154
155 static bool
156 bi_is_aligned_vec(bi_instruction *combine, unsigned s, bi_instruction *parent,
157 unsigned *count)
158 {
159 /* We only support prefixes */
160 if (s != 0)
161 return false;
162
163 /* Is it a contiguous write? */
164 unsigned writes = util_bitcount(parent->writemask);
165 if (parent->writemask != ((1 << writes) - 1))
166 return false;
167
168 /* Okay - how many components? */
169 unsigned bytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
170 unsigned components = writes / bytes;
171
172 /* Are we contiguous like that? */
173
174 for (unsigned i = 0; i < components; ++i) {
175 if (combine->src[i] != parent->dest)
176 return false;
177
178 if (combine->swizzle[i][0] != i)
179 return false;
180 }
181
182 /* We're good to go */
183 *count = components;
184 return true;
185 }
186
187 /* Tries to lower a given source of a combine to an appropriate rewrite,
188 * returning true if successful, and false with no changes otherwise. */
189
190 static bool
191 bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
192 unsigned *vec_count)
193 {
194 unsigned src = ins->src[s];
195
196 /* We currently only handle SSA */
197
198 if (!src) return false;
199 if (src & (BIR_SPECIAL | BIR_IS_REG)) return false;
200
201 /* We are SSA. Lookup the generating instruction. */
202 unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;
203
204 bi_instruction *parent = bi_get_parent(ctx, src,
205 0xF << (ins->swizzle[s][0] * bytes));
206
207 if (!parent) return false;
208
209 /* We have a parent instuction, sanity check the typesize */
210 unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
211 if (pbytes != bytes) return false;
212
213 bool scalar = (parent->writemask == ((1 << bytes) - 1));
214 if (!(scalar || bi_is_aligned_vec(ins, s, parent, vec_count))) return false;
215
216 if (!bi_shift_mask(parent, bytes * s)) return false;
217 bi_rewrite_uses(ctx, parent->dest, 0, R, s);
218 parent->dest = R;
219 return true;
220 }
221
222 void
223 bi_lower_combine(bi_context *ctx, bi_block *block)
224 {
225 bi_foreach_instr_in_block_safe(block, ins) {
226 if (ins->type != BI_COMBINE) continue;
227
228 /* The vector itself can't be shifted */
229 assert(ins->writemask & 0x1);
230
231 unsigned R = bi_make_temp_reg(ctx);
232
233 bi_foreach_src(ins, s) {
234 /* We're done early for vec2/3 */
235 if (!ins->src[s])
236 continue;
237
238 unsigned vec_count = 0;
239
240 if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
241 /* Skip vectored sources */
242 if (vec_count)
243 s += (vec_count - 1);
244 } else {
245 bi_insert_combine_mov(ctx, ins, s, R);
246 }
247 }
248
249
250 bi_rewrite_uses(ctx, ins->dest, 0, R, 0);
251 bi_remove_instruction(ins);
252 }
253 }