src/gallium/drivers/panfrost/midgard/midgard_ra.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "compiler.h"
  25 #include "midgard_ops.h"
  26 #include "util/register_allocate.h"
  27 #include "util/u_math.h"
  28
  29 /* For work registers, we can subdivide in various ways. So we create
  30  * classes for the various sizes and conflict accordingly, keeping in
  31  * mind that physical registers are divided along 128-bit boundaries.
  32  * The important part is that 128-bit boundaries are not crossed.
  33  *
  34  * For each 128-bit register, we can subdivide to 32-bits 10 ways
  35  *
  36  * vec4: xyzw
  37  * vec3: xyz, yzw
  38  * vec2: xy, yz, zw,
  39  * vec1: x, y, z, w
  40  *
  41  * For each 64-bit register, we can subdivide similarly to 16-bit
  42  * (TODO: half-float RA, not that we support fp16 yet)
  43  */
  44
  45 #define WORK_STRIDE 10
  46
  47 /* Prepacked masks/swizzles for virtual register types */
  48 static unsigned reg_type_to_mask[WORK_STRIDE] = {
  49         0xF,                                    /* xyzw */
  50         0x7, 0x7 << 1,                          /* xyz */
  51         0x3, 0x3 << 1, 0x3 << 2,                /* xy */
  52         0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3       /* x */
  53 };
  54
  55 static unsigned reg_type_to_swizzle[WORK_STRIDE] = {
  56         SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  57
  58         SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  59         SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_W, COMPONENT_W),
  60
  61         SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  62         SWIZZLE(COMPONENT_Y, COMPONENT_Z, COMPONENT_Z, COMPONENT_W),
  63         SWIZZLE(COMPONENT_Z, COMPONENT_W, COMPONENT_Z, COMPONENT_W),
  64
  65         SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  66         SWIZZLE(COMPONENT_Y, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  67         SWIZZLE(COMPONENT_Z, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  68         SWIZZLE(COMPONENT_W, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
  69 };
  70
/* Describes where a value lives in hardware terms: a register number plus
 * the component mask and swizzle selecting the part of it that is used */
struct phys_reg {
        /* Register number */
        unsigned reg;

        /* Component mask, one bit per component (bit 0 = x ... bit 3 = w) */
        unsigned mask;

        /* Packed swizzle, two bits per lane */
        unsigned swizzle;
};
  76
  77 /* Given the mask/swizzle of both the register and the original source,
  78  * compose to find the actual mask/swizzle to give the hardware */
  79
  80 static unsigned
  81 compose_writemask(unsigned mask, struct phys_reg reg)
  82 {
  83         /* Note: the reg mask is guaranteed to be contiguous. So we shift
  84          * into the X place, compose via a simple AND, and shift back */
  85
  86         unsigned shift = __builtin_ctz(reg.mask);
  87         return ((reg.mask >> shift) & mask) << shift;
  88 }
  89
  90 static unsigned
  91 compose_swizzle(unsigned swizzle, unsigned mask,
  92                 struct phys_reg reg, struct phys_reg dst)
  93 {
  94         unsigned out = 0;
  95
  96         for (unsigned c = 0; c < 4; ++c) {
  97                 unsigned s = (swizzle >> (2*c)) & 0x3;
  98                 unsigned q = (reg.swizzle >> (2*s)) & 0x3;
  99
 100                 out |= (q << (2*c));
 101         }
 102
 103         /* Based on the register mask, we need to adjust over. E.g if we're
 104          * writing to yz, a base swizzle of xy__ becomes _xy_. Save the
 105          * original first component (x). But to prevent duplicate shifting
 106          * (only applies to ALU -- mask param is set to xyzw out on L/S to
 107          * prevent changes), we have to account for the shift inherent to the
 108          * original writemask */
 109
 110         unsigned rep = out & 0x3;
 111         unsigned shift = __builtin_ctz(dst.mask) - __builtin_ctz(mask);
 112         unsigned shifted = out << (2*shift);
 113
 114         /* ..but we fill in the gaps so it appears to replicate */
 115
 116         for (unsigned s = 0; s < shift; ++s)
 117                 shifted |= rep << (2*s);
 118
 119         return shifted;
 120 }
 121
 122 /* When we're 'squeezing down' the values in the IR, we maintain a hash
 123  * as such */
 124
 125 static unsigned
 126 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
 127 {
 128         if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
 129                 return hash;
 130
 131         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
 132                         ctx->hash_to_temp, hash + 1);
 133
 134         if (temp)
 135                 return temp - 1;
 136
 137         /* If no temp is find, allocate one */
 138         temp = ctx->temp_count++;
 139         ctx->max_hash = MAX2(ctx->max_hash, hash);
 140
 141         _mesa_hash_table_u64_insert(ctx->hash_to_temp,
 142                         hash + 1, (void *) ((uintptr_t) temp + 1));
 143
 144         return temp;
 145 }
 146
 147 /* Callback for register allocation selection, trivial default for now */
 148
 149 static unsigned int
 150 midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
 151 {
 152         /* Choose the first available register to minimise register pressure */
 153
 154         for (int i = 0; i < (16 * WORK_STRIDE); ++i) {
 155                 if (BITSET_TEST(regs, i)) {
 156                         return i;
 157                 }
 158         }
 159
 160         assert(0);
 161         return 0;
 162 }
 163
 164 /* Helper to return the default phys_reg for a given register */
 165
 166 static struct phys_reg
 167 default_phys_reg(int reg)
 168 {
 169         struct phys_reg r = {
 170                 .reg = reg,
 171                 .mask = 0xF, /* xyzw */
 172                 .swizzle = 0xE4 /* xyzw */
 173         };
 174
 175         return r;
 176 }
 177
 178 /* Determine which physical register, swizzle, and mask a virtual
 179  * register corresponds to */
 180
 181 static struct phys_reg
 182 index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg)
 183 {
 184         /* Check for special cases */
 185         if (reg >= SSA_FIXED_MINIMUM)
 186                 return default_phys_reg(SSA_REG_FROM_FIXED(reg));
 187         else if ((reg < 0) || !g)
 188                 return default_phys_reg(REGISTER_UNUSED);
 189
 190         /* Special cases aside, we pick the underlying register */
 191         int virt = ra_get_node_reg(g, reg);
 192
 193         /* Divide out the register and classification */
 194         int phys = virt / WORK_STRIDE;
 195         int type = virt % WORK_STRIDE;
 196
 197         struct phys_reg r = {
 198                 .reg = phys,
 199                 .mask = reg_type_to_mask[type],
 200                 .swizzle = reg_type_to_swizzle[type]
 201         };
 202
 203         /* Report that we actually use this register, and return it */
 204         ctx->work_registers = MAX2(ctx->work_registers, phys);
 205         return r;
 206 }
 207
 208 /* This routine performs the actual register allocation. It should be succeeded
 209  * by install_registers */
 210
 211 struct ra_graph *
 212 allocate_registers(compiler_context *ctx)
 213 {
 214         /* The number of vec4 work registers available depends on when the
 215          * uniforms start, so compute that first */
 216
 217         int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
 218
 219         int virtual_count = work_count * WORK_STRIDE;
 220
 221         /* First, initialize the RA */
 222         struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true);
 223
 224         int work_vec4 = ra_alloc_reg_class(regs);
 225         int work_vec3 = ra_alloc_reg_class(regs);
 226         int work_vec2 = ra_alloc_reg_class(regs);
 227         int work_vec1 = ra_alloc_reg_class(regs);
 228
 229         unsigned classes[4] = {
 230                 work_vec1,
 231                 work_vec2,
 232                 work_vec3,
 233                 work_vec4
 234         };
 235
 236         /* Add the full set of work registers */
 237         for (unsigned i = 0; i < work_count; ++i) {
 238                 int base = WORK_STRIDE * i;
 239
 240                 /* Build a full set of subdivisions */
 241                 ra_class_add_reg(regs, work_vec4, base);
 242                 ra_class_add_reg(regs, work_vec3, base + 1);
 243                 ra_class_add_reg(regs, work_vec3, base + 2);
 244                 ra_class_add_reg(regs, work_vec2, base + 3);
 245                 ra_class_add_reg(regs, work_vec2, base + 4);
 246                 ra_class_add_reg(regs, work_vec2, base + 5);
 247                 ra_class_add_reg(regs, work_vec1, base + 6);
 248                 ra_class_add_reg(regs, work_vec1, base + 7);
 249                 ra_class_add_reg(regs, work_vec1, base + 8);
 250                 ra_class_add_reg(regs, work_vec1, base + 9);
 251
 252                 for (unsigned a = 0; a < 10; ++a) {
 253                         unsigned mask1 = reg_type_to_mask[a];
 254
 255                         for (unsigned b = 0; b < 10; ++b) {
 256                                 unsigned mask2 = reg_type_to_mask[b];
 257
 258                                 if (mask1 & mask2)
 259                                         ra_add_reg_conflict(regs,
 260                                                         base + a, base + b);
 261                         }
 262                 }
 263         }
 264
 265         /* We're done setting up */
 266         ra_set_finalize(regs, NULL);
 267
 268         /* Transform the MIR into squeezed index form */
 269         mir_foreach_block(ctx, block) {
 270                 mir_foreach_instr_in_block(block, ins) {
 271                         if (ins->compact_branch) continue;
 272
 273                         ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
 274                         ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
 275
 276                         if (!ins->ssa_args.inline_constant)
 277                                 ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
 278
 279                 }
 280         }
 281
 282         /* No register allocation to do with no SSA */
 283
 284         if (!ctx->temp_count)
 285                 return NULL;
 286
 287         /* Let's actually do register allocation */
 288         int nodes = ctx->temp_count;
 289         struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
 290
 291         /* Determine minimum size needed to hold values, to indirectly
 292          * determine class */
 293
 294         unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
 295
 296         mir_foreach_block(ctx, block) {
 297                 mir_foreach_instr_in_block(block, ins) {
 298                         if (ins->compact_branch) continue;
 299                         if (ins->ssa_args.dest < 0) continue;
 300                         if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
 301
 302                         /* Default to vec4 if we're not sure */
 303
 304                         int mask = 0xF;
 305
 306                         if (ins->type == TAG_ALU_4)
 307                                 mask = squeeze_writemask(ins->alu.mask);
 308                         else if (ins->type == TAG_LOAD_STORE_4)
 309                                 mask = ins->load_store.mask;
 310
 311                         int class = util_logbase2(mask) + 1;
 312
 313                         /* Use the largest class if there's ambiguity, this
 314                          * handles partial writes */
 315
 316                         int dest = ins->ssa_args.dest;
 317                         found_class[dest] = MAX2(found_class[dest], class);
 318                 }
 319         }
 320
 321         for (unsigned i = 0; i < ctx->temp_count; ++i) {
 322                 unsigned class = found_class[i];
 323                 if (!class) continue;
 324                 ra_set_node_class(g, i, classes[class - 1]);
 325         }
 326
 327         /* Determine liveness */
 328
 329         int *live_start = malloc(nodes * sizeof(int));
 330         int *live_end = malloc(nodes * sizeof(int));
 331
 332         /* Initialize as non-existent */
 333
 334         for (int i = 0; i < nodes; ++i) {
 335                 live_start[i] = live_end[i] = -1;
 336         }
 337
 338         int d = 0;
 339
 340         mir_foreach_block(ctx, block) {
 341                 mir_foreach_instr_in_block(block, ins) {
 342                         if (ins->compact_branch) continue;
 343
 344                         /* Dest is < 0 for st_vary instructions, which break
 345                          * the usual SSA conventions. Liveness analysis doesn't
 346                          * make sense on these instructions, so skip them to
 347                          * avoid memory corruption */
 348
 349                         if (ins->ssa_args.dest < 0) continue;
 350
 351                         if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) {
 352                                 /* If this destination is not yet live, it is
 353                                  * now since we just wrote it */
 354
 355                                 int dest = ins->ssa_args.dest;
 356
 357                                 if (live_start[dest] == -1)
 358                                         live_start[dest] = d;
 359                         }
 360
 361                         /* Since we just used a source, the source might be
 362                          * dead now. Scan the rest of the block for
 363                          * invocations, and if there are none, the source dies
 364                          * */
 365
 366                         int sources[2] = {
 367                                 ins->ssa_args.src0, ins->ssa_args.src1
 368                         };
 369
 370                         for (int src = 0; src < 2; ++src) {
 371                                 int s = sources[src];
 372
 373                                 if (s < 0) continue;
 374
 375                                 if (s >= SSA_FIXED_MINIMUM) continue;
 376
 377                                 if (!mir_is_live_after(ctx, block, ins, s)) {
 378                                         live_end[s] = d;
 379                                 }
 380                         }
 381
 382                         ++d;
 383                 }
 384         }
 385
 386         /* If a node still hasn't been killed, kill it now */
 387
 388         for (int i = 0; i < nodes; ++i) {
 389                 /* live_start == -1 most likely indicates a pinned output */
 390
 391                 if (live_end[i] == -1)
 392                         live_end[i] = d;
 393         }
 394
 395         /* Setup interference between nodes that are live at the same time */
 396
 397         for (int i = 0; i < nodes; ++i) {
 398                 for (int j = i + 1; j < nodes; ++j) {
 399                         bool j_overlaps_i = live_start[j] < live_end[i];
 400                         bool i_overlaps_j = live_end[j] < live_start[i];
 401
 402                         if (i_overlaps_j || j_overlaps_i)
 403                                 ra_add_node_interference(g, i, j);
 404                 }
 405         }
 406
 407         ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL);
 408
 409         if (!ra_allocate(g)) {
 410                 unreachable("Error allocating registers\n");
 411         }
 412
 413         /* Cleanup */
 414         free(live_start);
 415         free(live_end);
 416
 417         return g;
 418 }
 419
 420 /* Once registers have been decided via register allocation
 421  * (allocate_registers), we need to rewrite the MIR to use registers instead of
 422  * indices */
 423
 424 static void
 425 install_registers_instr(
 426                 compiler_context *ctx,
 427                 struct ra_graph *g,
 428                 midgard_instruction *ins)
 429 {
 430         ssa_args args = ins->ssa_args;
 431
 432         switch (ins->type) {
 433         case TAG_ALU_4: {
 434                 int adjusted_src = args.inline_constant ? -1 : args.src1;
 435                 struct phys_reg src1 = index_to_reg(ctx, g, args.src0);
 436                 struct phys_reg src2 = index_to_reg(ctx, g, adjusted_src);
 437                 struct phys_reg dest = index_to_reg(ctx, g, args.dest);
 438
 439                 unsigned mask = squeeze_writemask(ins->alu.mask);
 440                 ins->alu.mask = expand_writemask(compose_writemask(mask, dest));
 441
 442                 /* Adjust the dest mask if necessary. Mostly this is a no-op
 443                  * but it matters for dot products */
 444                 dest.mask = effective_writemask(&ins->alu);
 445
 446                 midgard_vector_alu_src mod1 =
 447                         vector_alu_from_unsigned(ins->alu.src1);
 448                 mod1.swizzle = compose_swizzle(mod1.swizzle, mask, src1, dest);
 449                 ins->alu.src1 = vector_alu_srco_unsigned(mod1);
 450
 451                 ins->registers.src1_reg = src1.reg;
 452
 453                 ins->registers.src2_imm = args.inline_constant;
 454
 455                 if (args.inline_constant) {
 456                         /* Encode inline 16-bit constant. See disassembler for
 457                          * where the algorithm is from */
 458
 459                         ins->registers.src2_reg = ins->inline_constant >> 11;
 460
 461                         int lower_11 = ins->inline_constant & ((1 << 12) - 1);
 462                         uint16_t imm = ((lower_11 >> 8) & 0x7) |
 463                                 ((lower_11 & 0xFF) << 3);
 464
 465                         ins->alu.src2 = imm << 2;
 466                 } else {
 467                         midgard_vector_alu_src mod2 =
 468                                 vector_alu_from_unsigned(ins->alu.src2);
 469                         mod2.swizzle = compose_swizzle(
 470                                         mod2.swizzle, mask, src2, dest);
 471                         ins->alu.src2 = vector_alu_srco_unsigned(mod2);
 472
 473                         ins->registers.src2_reg = src2.reg;
 474                 }
 475
 476                 ins->registers.out_reg = dest.reg;
 477                 break;
 478         }
 479
 480         case TAG_LOAD_STORE_4: {
 481                 if (OP_IS_STORE(ins->load_store.op)) {
 482                         /* TODO: use ssa_args for st_vary */
 483                         ins->load_store.reg = 0;
 484                 } else {
 485                         struct phys_reg src = index_to_reg(ctx, g, args.dest);
 486
 487                         ins->load_store.reg = src.reg;
 488
 489                         ins->load_store.swizzle = compose_swizzle(
 490                                         ins->load_store.swizzle, 0xF,
 491                                         default_phys_reg(0), src);
 492
 493                         ins->load_store.mask = compose_writemask(
 494                                         ins->load_store.mask, src);
 495                 }
 496
 497                 break;
 498         }
 499
 500         default:
 501                 break;
 502         }
 503 }
 504
 505 void
 506 install_registers(compiler_context *ctx, struct ra_graph *g)
 507 {
 508         mir_foreach_block(ctx, block) {
 509                 mir_foreach_instr_in_block(block, ins) {
 510                         if (ins->compact_branch) continue;
 511                         install_registers_instr(ctx, g, ins);
 512                 }
 513         }
 514
 515 }