src/gallium/drivers/panfrost/midgard/midgard_ra.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "compiler.h"
  25 #include "midgard_ops.h"
  26 #include "util/register_allocate.h"
  27 #include "util/u_math.h"
  28
  29 /* For work registers, we can subdivide in various ways. So we create
  30  * classes for the various sizes and conflict accordingly, keeping in
  31  * mind that physical registers are divided along 128-bit boundaries.
  32  * The important part is that 128-bit boundaries are not crossed.
  33  *
 * For each 128-bit register, there are 10 ways to subdivide it into
 * contiguous runs of 32-bit components:
 *
 * vec4: xyzw
 * vec3: xyz, yzw
 * vec2: xy, yz, zw
 * vec1: x, y, z, w
  40  *
  41  * For each 64-bit register, we can subdivide similarly to 16-bit
  42  * (TODO: half-float RA, not that we support fp16 yet)
  43  */
  44
  45 #define WORK_STRIDE 10
  46
  47 /* Prepacked masks/swizzles for virtual register types */
  48 static unsigned reg_type_to_mask[WORK_STRIDE] = {
  49         0xF,                                    /* xyzw */
  50         0x7, 0x7 << 1,                          /* xyz */
  51         0x3, 0x3 << 1, 0x3 << 2,                /* xy */
  52         0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3       /* x */
  53 };
  54
  55 static unsigned reg_type_to_swizzle[WORK_STRIDE] = {
  56         SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  57
  58         SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  59         SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W),
  60
  61         SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  62         SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W),
  63         SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W),
  64
  65         SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  66         SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  67         SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  68         SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  69 };
  70
  71 struct phys_reg {
  72         unsigned reg;
  73         unsigned mask;
  74         unsigned swizzle;
  75 };
  76
  77 /* Given the mask/swizzle of both the register and the original source,
  78  * compose to find the actual mask/swizzle to give the hardware */
  79
  80 static unsigned
  81 compose_writemask(unsigned mask, struct phys_reg reg)
  82 {
  83         /* Note: the reg mask is guaranteed to be contiguous. So we shift
  84          * into the X place, compose via a simple AND, and shift back */
  85
  86         unsigned shift = __builtin_ctz(reg.mask);
  87         return ((reg.mask >> shift) & mask) << shift;
  88 }
  89
  90 static unsigned
  91 compose_swizzle(unsigned swizzle, unsigned mask, struct phys_reg reg, struct phys_reg dst)
  92 {
  93         unsigned out = 0;
  94
  95         for (unsigned c = 0; c < 4; ++c) {
  96                 unsigned s = (swizzle >> (2*c)) & 0x3;
  97                 unsigned q = (reg.swizzle >> (2*s)) & 0x3;
  98
  99                 out |= (q << (2*c));
 100         }
 101
 102         /* Based on the register mask, we need to adjust over. E.g if we're
 103          * writing to yz, a base swizzle of xy__ becomes _xy_. Save the
 104          * original first component (x). But to prevent duplicate shifting
 105          * (only applies to ALU -- mask param is set to xyzw out on L/S to
 106          * prevent changes), we have to account for the shift inherent to the
 107          * original writemask */
 108
 109         unsigned rep = out & 0x3;
 110         unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask);
 111         unsigned shifted = out << (2*shift);
 112
 113         /* ..but we fill in the gaps so it appears to replicate */
 114
 115         for (unsigned s = 0; s < shift; ++s)
 116                 shifted |= rep << (2*s);
 117
 118         return shifted;
 119 }
 120
 121 /* When we're 'squeezing down' the values in the IR, we maintain a hash
 122  * as such */
 123
 124 static unsigned
 125 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
 126 {
 127         if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
 128                 return hash;
 129
 130         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->hash_to_temp, hash + 1);
 131
 132         if (temp)
 133                 return temp - 1;
 134
 135         /* If no temp is find, allocate one */
 136         temp = ctx->temp_count++;
 137         ctx->max_hash = MAX2(ctx->max_hash, hash);
 138
 139         _mesa_hash_table_u64_insert(ctx->hash_to_temp, hash + 1, (void *) ((uintptr_t) temp + 1));
 140
 141         return temp;
 142 }
 143
 144 /* Callback for register allocation selection, trivial default for now */
 145
 146 static unsigned int
 147 midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
 148 {
 149         /* Choose the first available register to minimise reported register pressure */
 150
 151         for (int i = 0; i < (16 * WORK_STRIDE); ++i) {
 152                 if (BITSET_TEST(regs, i)) {
 153                         return i;
 154                 }
 155         }
 156
 157         assert(0);
 158         return 0;
 159 }
 160
 161 /* Helper to return the default phys_reg for a given register */
 162
 163 static struct phys_reg
 164 default_phys_reg(int reg)
 165 {
 166         struct phys_reg r = {
 167                 .reg = reg,
 168                 .mask = 0xF, /* xyzw */
 169                 .swizzle = 0xE4 /* xyzw */
 170         };
 171
 172         return r;
 173 }
 174
 175 /* Determine which physical register, swizzle, and mask a virtual
 176  * register corresponds to */
 177
 178 static struct phys_reg
 179 index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg)
 180 {
 181         /* Check for special cases */
 182         if (reg >= SSA_FIXED_MINIMUM)
 183                 return default_phys_reg(SSA_REG_FROM_FIXED(reg));
 184         else if ((reg < 0) || !g)
 185                 return default_phys_reg(REGISTER_UNUSED);
 186
 187         /* Special cases aside, we pick the underlying register */
 188         int virt = ra_get_node_reg(g, reg);
 189
 190         /* Divide out the register and classification */
 191         int phys = virt / WORK_STRIDE;
 192         int type = virt % WORK_STRIDE;
 193
 194         struct phys_reg r = {
 195                 .reg = phys,
 196                 .mask = reg_type_to_mask[type],
 197                 .swizzle = reg_type_to_swizzle[type]
 198         };
 199
 200         /* Report that we actually use this register, and return it */
 201         ctx->work_registers = MAX2(ctx->work_registers, phys);
 202         return r;
 203 }
 204
 205 /* This routine performs the actual register allocation. It should be succeeded
 206  * by install_registers */
 207
 208 struct ra_graph *
 209 allocate_registers(compiler_context *ctx)
 210 {
 211         /* The number of vec4 work registers available depends on when the
 212          * uniforms start, so compute that first */
 213
 214         int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
 215
 216         int virtual_count = work_count * WORK_STRIDE;
 217
 218         /* First, initialize the RA */
 219         struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true);
 220
 221         int work_vec4 = ra_alloc_reg_class(regs);
 222         int work_vec3 = ra_alloc_reg_class(regs);
 223         int work_vec2 = ra_alloc_reg_class(regs);
 224         int work_vec1 = ra_alloc_reg_class(regs);
 225
 226         unsigned classes[4] = {
 227                 work_vec1,
 228                 work_vec2,
 229                 work_vec3,
 230                 work_vec4
 231         };
 232
 233         /* Add the full set of work registers */
 234         for (int i = 0; i < work_count; ++i) {
 235                 int base = WORK_STRIDE * i;
 236
 237                 /* Build a full set of subdivisions */
 238                 ra_class_add_reg(regs, work_vec4, base);
 239                 ra_class_add_reg(regs, work_vec3, base + 1);
 240                 ra_class_add_reg(regs, work_vec3, base + 2);
 241                 ra_class_add_reg(regs, work_vec2, base + 3);
 242                 ra_class_add_reg(regs, work_vec2, base + 4);
 243                 ra_class_add_reg(regs, work_vec2, base + 5);
 244                 ra_class_add_reg(regs, work_vec1, base + 6);
 245                 ra_class_add_reg(regs, work_vec1, base + 7);
 246                 ra_class_add_reg(regs, work_vec1, base + 8);
 247                 ra_class_add_reg(regs, work_vec1, base + 9);
 248
 249                 for (unsigned i = 0; i < 10; ++i) {
 250                         for (unsigned j = 0; j < 10; ++j) {
 251                                 unsigned mask1 = reg_type_to_mask[i];
 252                                 unsigned mask2 = reg_type_to_mask[j];
 253
 254                                 if (mask1 & mask2)
 255                                         ra_add_reg_conflict(regs, base + i, base + j);
 256                         }
 257                 }
 258         }
 259
 260         /* We're done setting up */
 261         ra_set_finalize(regs, NULL);
 262
 263         /* Transform the MIR into squeezed index form */
 264         mir_foreach_block(ctx, block) {
 265                 mir_foreach_instr_in_block(block, ins) {
 266                         if (ins->compact_branch) continue;
 267
 268                         ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
 269                         ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
 270
 271                         if (!ins->ssa_args.inline_constant)
 272                                 ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
 273
 274                 }
 275         }
 276
 277         /* No register allocation to do with no SSA */
 278
 279         if (!ctx->temp_count)
 280                 return NULL;
 281
 282         /* Let's actually do register allocation */
 283         int nodes = ctx->temp_count;
 284         struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
 285
 286         /* Determine minimum size needed to hold values, to indirectly
 287          * determine class */
 288
 289         unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
 290
 291         mir_foreach_block(ctx, block) {
 292                 mir_foreach_instr_in_block(block, ins) {
 293                         if (ins->compact_branch) continue;
 294                         if (ins->ssa_args.dest < 0) continue;
 295                         if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
 296
 297                         /* Default to vec4 if we're not sure */
 298
 299                         int mask = 0xF;
 300
 301                         if (ins->type == TAG_ALU_4)
 302                                 mask = squeeze_writemask(ins->alu.mask);
 303                         else if (ins->type == TAG_LOAD_STORE_4)
 304                                 mask = ins->load_store.mask;
 305
 306                         int class = util_logbase2(mask) + 1;
 307
 308                         /* Use the largest class if there's ambiguity, this
 309                          * handles partial writes */
 310
 311                         int dest = ins->ssa_args.dest;
 312                         found_class[dest] = MAX2(found_class[dest], class);
 313                 }
 314         }
 315
 316         for (unsigned i = 0; i < ctx->temp_count; ++i) {
 317                 unsigned class = found_class[i];
 318                 if (!class) continue;
 319                 ra_set_node_class(g, i, classes[class - 1]);
 320         }
 321
 322         /* Determine liveness */
 323
 324         int *live_start = malloc(nodes * sizeof(int));
 325         int *live_end = malloc(nodes * sizeof(int));
 326
 327         /* Initialize as non-existent */
 328
 329         for (int i = 0; i < nodes; ++i) {
 330                 live_start[i] = live_end[i] = -1;
 331         }
 332
 333         int d = 0;
 334
 335         mir_foreach_block(ctx, block) {
 336                 mir_foreach_instr_in_block(block, ins) {
 337                         if (ins->compact_branch) continue;
 338
 339                         /* Dest is < 0 for st_vary instructions, which break
 340                          * the usual SSA conventions. Liveness analysis doesn't
 341                          * make sense on these instructions, so skip them to
 342                          * avoid memory corruption */
 343
 344                         if (ins->ssa_args.dest < 0) continue;
 345
 346                         if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) {
 347                                 /* If this destination is not yet live, it is now since we just wrote it */
 348
 349                                 int dest = ins->ssa_args.dest;
 350
 351                                 if (live_start[dest] == -1)
 352                                         live_start[dest] = d;
 353                         }
 354
 355                         /* Since we just used a source, the source might be
 356                          * dead now. Scan the rest of the block for
 357                          * invocations, and if there are none, the source dies
 358                          * */
 359
 360                         int sources[2] = { ins->ssa_args.src0, ins->ssa_args.src1 };
 361
 362                         for (int src = 0; src < 2; ++src) {
 363                                 int s = sources[src];
 364
 365                                 if (s < 0) continue;
 366
 367                                 if (s >= SSA_FIXED_MINIMUM) continue;
 368
 369                                 if (!mir_is_live_after(ctx, block, ins, s)) {
 370                                         live_end[s] = d;
 371                                 }
 372                         }
 373
 374                         ++d;
 375                 }
 376         }
 377
 378         /* If a node still hasn't been killed, kill it now */
 379
 380         for (int i = 0; i < nodes; ++i) {
 381                 /* live_start == -1 most likely indicates a pinned output */
 382
 383                 if (live_end[i] == -1)
 384                         live_end[i] = d;
 385         }
 386
 387         /* Setup interference between nodes that are live at the same time */
 388
 389         for (int i = 0; i < nodes; ++i) {
 390                 for (int j = i + 1; j < nodes; ++j) {
 391                         if (!(live_start[i] >= live_end[j] || live_start[j] >= live_end[i]))
 392                                 ra_add_node_interference(g, i, j);
 393                 }
 394         }
 395
 396         ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL);
 397
 398         if (!ra_allocate(g)) {
 399                 unreachable("Error allocating registers\n");
 400         }
 401
 402         /* Cleanup */
 403         free(live_start);
 404         free(live_end);
 405
 406         return g;
 407 }
 408
 409 /* Once registers have been decided via register allocation
 410  * (allocate_registers), we need to rewrite the MIR to use registers instead of
 411  * indices */
 412
 413 static void
 414 install_registers_instr(
 415                 compiler_context *ctx,
 416                 struct ra_graph *g,
 417                 midgard_instruction *ins)
 418 {
 419         ssa_args args = ins->ssa_args;
 420
 421         switch (ins->type) {
 422         case TAG_ALU_4: {
 423                 int adjusted_src = args.inline_constant ? -1 : args.src1;
 424                 struct phys_reg src1 = index_to_reg(ctx, g, args.src0);
 425                 struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src);
 426                 struct phys_reg dest = index_to_reg(ctx, g, args.dest);
 427
 428                 unsigned mask = squeeze_writemask(ins->alu.mask);
 429                 ins->alu.mask = expand_writemask(compose_writemask(mask, dest));
 430
 431                 /* Adjust the dest mask if necessary. Mostly this is a no-op
 432                  * but it matters for dot products */
 433                 dest.mask = effective_writemask(&ins->alu);
 434
 435                 midgard_vector_alu_src mod1 =
 436                         vector_alu_from_unsigned(ins->alu.src1);
 437                 mod1.swizzle = compose_swizzle(mod1.swizzle, mask, src1, dest);
 438                 ins->alu.src1 = vector_alu_srco_unsigned(mod1);
 439
 440                 ins->registers.src1_reg = src1.reg;
 441
 442                 ins->registers.src2_imm = args.inline_constant;
 443
 444                 if (args.inline_constant) {
 445                         /* Encode inline 16-bit constant as a vector by default */
 446
 447                         ins->registers.src2_reg = ins->inline_constant >> 11;
 448
 449                         int lower_11 = ins->inline_constant & ((1 << 12) - 1);
 450
 451                         uint16_t imm = ((lower_11 >> 8) & 0x7) | ((lower_11 & 0xFF) << 3);
 452                         ins->alu.src2 = imm << 2;
 453                 } else {
 454                         midgard_vector_alu_src mod2 =
 455                                 vector_alu_from_unsigned(ins->alu.src2);
 456                         mod2.swizzle = compose_swizzle(mod2.swizzle, mask, src2, dest);
 457                         ins->alu.src2 = vector_alu_srco_unsigned(mod2);
 458
 459                         ins->registers.src2_reg = src2.reg;
 460                 }
 461
 462                 ins->registers.out_reg = dest.reg;
 463                 break;
 464         }
 465
 466         case TAG_LOAD_STORE_4: {
 467                 if (OP_IS_STORE(ins->load_store.op)) {
 468                         /* TODO: use ssa_args for st_vary */
 469                         ins->load_store.reg = 0;
 470                 } else {
 471                         struct phys_reg src = index_to_reg(ctx, g, args.dest);
 472
 473                         ins->load_store.reg = src.reg;
 474
 475                         ins->load_store.swizzle = compose_swizzle(
 476                                         ins->load_store.swizzle, 0xF,
 477                                         default_phys_reg(0), src);
 478
 479                         ins->load_store.mask = compose_writemask(
 480                                         ins->load_store.mask, src);
 481                 }
 482
 483                 break;
 484         }
 485
 486         default:
 487                 break;
 488         }
 489 }
 490
 491 void
 492 install_registers(compiler_context *ctx, struct ra_graph *g)
 493 {
 494         mir_foreach_block(ctx, block) {
 495                 mir_foreach_instr_in_block(block, ins) {
 496                         if (ins->compact_branch) continue;
 497                         install_registers_instr(ctx, g, ins);
 498                 }
 499         }
 500
 501 }