src/panfrost/midgard/midgard_ra.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  * Copyright (C) 2019 Collabora, Ltd.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "compiler.h"
  26 #include "midgard_ops.h"
  27 #include "util/register_allocate.h"
  28 #include "util/u_math.h"
  29 #include "util/u_memory.h"
  30 #include "lcra.h"
  31
  32 /* For work registers, we can subdivide in various ways. So we create
  33  * classes for the various sizes and conflict accordingly, keeping in
  34  * mind that physical registers are divided along 128-bit boundaries.
  35  * The important part is that 128-bit boundaries are not crossed.
  36  *
  37  * For each 128-bit register, we can subdivide to 32-bits 10 ways
  38  *
  39  * vec4: xyzw
  40  * vec3: xyz, yzw
  41  * vec2: xy, yz, zw,
  42  * vec1: x, y, z, w
  43  *
  44  * For each 64-bit register, we can subdivide similarly to 16-bit
  45  * (TODO: half-float RA, not that we support fp16 yet)
  46  */
  47
  48 #define WORK_STRIDE 10
  49
  50 /* We have overlapping register classes for special registers, handled via
  51  * shadows */
  52
  53 #define SHADOW_R0  17
  54 #define SHADOW_R28 18
  55 #define SHADOW_R29 19
  56
  57 /* Prepacked masks/swizzles for virtual register types */
  58 static unsigned reg_type_to_mask[WORK_STRIDE] = {
  59         0xF,                                    /* xyzw */
  60         0x7, 0x7 << 1,                          /* xyz */
  61                  0x3, 0x3 << 1, 0x3 << 2,                /* xy */
  62                  0x1, 0x1 << 1, 0x1 << 2, 0x1 << 3       /* x */
  63 };
  64
  65 struct phys_reg {
  66         /* Physical register: 0-31 */
  67         unsigned reg;
  68
  69         /* Byte offset into the physical register: 0-15 */
  70         unsigned offset;
  71
  72         /* Number of bytes in a component of this register */
  73         unsigned size;
  74 };
  75
  76 /* Shift up by reg_offset and horizontally by dst_offset. */
  77
  78 static void
  79 offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcsize, unsigned dst_offset)
  80 {
  81         unsigned out[MIR_VEC_COMPONENTS];
  82
  83         signed reg_comp = reg_offset / srcsize;
  84         signed dst_comp = dst_offset / srcsize;
  85
  86         unsigned max_component = (16 / srcsize) - 1;
  87
  88         assert(reg_comp * srcsize == reg_offset);
  89         assert(dst_comp * srcsize == dst_offset);
  90
  91         for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) {
  92                 signed comp = MAX2(c - dst_comp, 0);
  93                 out[c] = MIN2(swizzle[comp] + reg_comp, max_component);
  94         }
  95
  96         memcpy(swizzle, out, sizeof(out));
  97 }
  98
  99 /* Helper to return the default phys_reg for a given register */
 100
 101 static struct phys_reg
 102 default_phys_reg(int reg, midgard_reg_mode size)
 103 {
 104         struct phys_reg r = {
 105                 .reg = reg,
 106                 .offset = 0,
 107                 .size = mir_bytes_for_mode(size)
 108         };
 109
 110         return r;
 111 }
 112
 113 /* Determine which physical register, swizzle, and mask a virtual
 114  * register corresponds to */
 115
 116 static struct phys_reg
 117 index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg, midgard_reg_mode size)
 118 {
 119         /* Check for special cases */
 120         if (reg == ~0)
 121                 return default_phys_reg(REGISTER_UNUSED, size);
 122         else if (reg >= SSA_FIXED_MINIMUM)
 123                 return default_phys_reg(SSA_REG_FROM_FIXED(reg), size);
 124         else if (!l)
 125                 return default_phys_reg(REGISTER_UNUSED, size);
 126
 127         struct phys_reg r = {
 128                 .reg = l->solutions[reg] / 16,
 129                 .offset = l->solutions[reg] & 0xF,
 130                 .size = mir_bytes_for_mode(size)
 131         };
 132
 133         /* Report that we actually use this register, and return it */
 134
 135         if (r.reg < 16)
 136                 ctx->work_registers = MAX2(ctx->work_registers, r.reg);
 137
 138         return r;
 139 }
 140
/* create_register_set (defined after the add_shadow_conflicts helper below)
 * creates a register set. It should be called infrequently since it's slow
 * and its result can be cached. For legibility, variables are named in terms
 * of work registers, although it is also used to create the register set for
 * special register allocation */
 145
 146 static void
 147 add_shadow_conflicts (struct ra_regs *regs, unsigned base, unsigned shadow, unsigned shadow_count)
 148 {
 149         for (unsigned a = 0; a < WORK_STRIDE; ++a) {
 150                 unsigned reg_a = (WORK_STRIDE * base) + a;
 151
 152                 for (unsigned b = 0; b < shadow_count; ++b) {
 153                         unsigned reg_b = (WORK_STRIDE * shadow) + b;
 154
 155                         ra_add_reg_conflict(regs, reg_a, reg_b);
 156                         ra_add_reg_conflict(regs, reg_b, reg_a);
 157                 }
 158         }
 159 }
 160
 161 static struct ra_regs *
 162 create_register_set(unsigned work_count, unsigned *classes)
 163 {
 164         int virtual_count = 32 * WORK_STRIDE;
 165
 166         /* First, initialize the RA */
 167         struct ra_regs *regs = ra_alloc_reg_set(NULL, virtual_count, true);
 168
 169         for (unsigned c = 0; c < (NR_REG_CLASSES - 1); ++c) {
 170                 int work_vec4 = ra_alloc_reg_class(regs);
 171                 int work_vec3 = ra_alloc_reg_class(regs);
 172                 int work_vec2 = ra_alloc_reg_class(regs);
 173                 int work_vec1 = ra_alloc_reg_class(regs);
 174
 175                 classes[4*c + 0] = work_vec1;
 176                 classes[4*c + 1] = work_vec2;
 177                 classes[4*c + 2] = work_vec3;
 178                 classes[4*c + 3] = work_vec4;
 179
 180                 /* Special register classes have other register counts */
 181                 unsigned count =
 182                         (c == REG_CLASS_WORK)   ? work_count : 2;
 183
 184                 unsigned first_reg =
 185                         (c == REG_CLASS_LDST)   ? 26 :
 186                         (c == REG_CLASS_TEXR)   ? 28 :
 187                         (c == REG_CLASS_TEXW)   ? SHADOW_R28 :
 188                         0;
 189
 190                 /* Add the full set of work registers */
 191                 for (unsigned i = first_reg; i < (first_reg + count); ++i) {
 192                         int base = WORK_STRIDE * i;
 193
 194                         /* Build a full set of subdivisions */
 195                         ra_class_add_reg(regs, work_vec4, base);
 196                         ra_class_add_reg(regs, work_vec3, base + 1);
 197                         ra_class_add_reg(regs, work_vec3, base + 2);
 198                         ra_class_add_reg(regs, work_vec2, base + 3);
 199                         ra_class_add_reg(regs, work_vec2, base + 4);
 200                         ra_class_add_reg(regs, work_vec2, base + 5);
 201                         ra_class_add_reg(regs, work_vec1, base + 6);
 202                         ra_class_add_reg(regs, work_vec1, base + 7);
 203                         ra_class_add_reg(regs, work_vec1, base + 8);
 204                         ra_class_add_reg(regs, work_vec1, base + 9);
 205
 206                         for (unsigned a = 0; a < 10; ++a) {
 207                                 unsigned mask1 = reg_type_to_mask[a];
 208
 209                                 for (unsigned b = 0; b < 10; ++b) {
 210                                         unsigned mask2 = reg_type_to_mask[b];
 211
 212                                         if (mask1 & mask2)
 213                                                 ra_add_reg_conflict(regs,
 214                                                                     base + a, base + b);
 215                                 }
 216                         }
 217                 }
 218         }
 219
 220         int fragc = ra_alloc_reg_class(regs);
 221
 222         classes[4*REG_CLASS_FRAGC + 0] = fragc;
 223         classes[4*REG_CLASS_FRAGC + 1] = fragc;
 224         classes[4*REG_CLASS_FRAGC + 2] = fragc;
 225         classes[4*REG_CLASS_FRAGC + 3] = fragc;
 226         ra_class_add_reg(regs, fragc, WORK_STRIDE * SHADOW_R0);
 227
 228         /* We have duplicate classes */
 229         add_shadow_conflicts(regs,  0, SHADOW_R0,  1);
 230         add_shadow_conflicts(regs, 28, SHADOW_R28, WORK_STRIDE);
 231         add_shadow_conflicts(regs, 29, SHADOW_R29, WORK_STRIDE);
 232
 233         /* We're done setting up */
 234         ra_set_finalize(regs, NULL);
 235
 236         return regs;
 237 }
 238
 239 /* This routine gets a precomputed register set off the screen if it's able, or
 240  * otherwise it computes one on the fly */
 241
 242 static struct ra_regs *
 243 get_register_set(struct midgard_screen *screen, unsigned work_count, unsigned **classes)
 244 {
 245         /* Bounds check */
 246         assert(work_count >= 8);
 247         assert(work_count <= 16);
 248
 249         /* Compute index */
 250         unsigned index = work_count - 8;
 251
 252         /* Find the reg set */
 253         struct ra_regs *cached = screen->regs[index];
 254
 255         if (cached) {
 256                 assert(screen->reg_classes[index]);
 257                 *classes = screen->reg_classes[index];
 258                 return cached;
 259         }
 260
 261         /* Otherwise, create one */
 262         struct ra_regs *created = create_register_set(work_count, screen->reg_classes[index]);
 263
 264         /* Cache it and use it */
 265         screen->regs[index] = created;
 266
 267         *classes = screen->reg_classes[index];
 268         return created;
 269 }
 270
 271 /* Assign a (special) class, ensuring that it is compatible with whatever class
 272  * was already set */
 273
 274 static void
 275 set_class(unsigned *classes, unsigned node, unsigned class)
 276 {
 277         /* Check that we're even a node */
 278         if (node >= SSA_FIXED_MINIMUM)
 279                 return;
 280
 281         /* First 4 are work, next 4 are load/store.. */
 282         unsigned current_class = classes[node] >> 2;
 283
 284         /* Nothing to do */
 285         if (class == current_class)
 286                 return;
 287
 288         /* If we're changing, we haven't assigned a special class */
 289         assert(current_class == REG_CLASS_WORK);
 290
 291         classes[node] &= 0x3;
 292         classes[node] |= (class << 2);
 293 }
 294
 295 static void
 296 force_vec4(unsigned *classes, unsigned node)
 297 {
 298         if (node >= SSA_FIXED_MINIMUM)
 299                 return;
 300
 301         /* Force vec4 = 3 */
 302         classes[node] |= 0x3;
 303 }
 304
 305 /* Special register classes impose special constraints on who can read their
 306  * values, so check that */
 307
 308 static bool
 309 check_read_class(unsigned *classes, unsigned tag, unsigned node)
 310 {
 311         /* Non-nodes are implicitly ok */
 312         if (node >= SSA_FIXED_MINIMUM)
 313                 return true;
 314
 315         unsigned current_class = classes[node] >> 2;
 316
 317         switch (current_class) {
 318         case REG_CLASS_LDST:
 319                 return (tag == TAG_LOAD_STORE_4);
 320         case REG_CLASS_TEXR:
 321                 return (tag == TAG_TEXTURE_4);
 322         case REG_CLASS_TEXW:
 323                 return (tag != TAG_LOAD_STORE_4);
 324         case REG_CLASS_WORK:
 325                 return IS_ALU(tag);
 326         default:
 327                 unreachable("Invalid class");
 328         }
 329 }
 330
 331 static bool
 332 check_write_class(unsigned *classes, unsigned tag, unsigned node)
 333 {
 334         /* Non-nodes are implicitly ok */
 335         if (node >= SSA_FIXED_MINIMUM)
 336                 return true;
 337
 338         unsigned current_class = classes[node] >> 2;
 339
 340         switch (current_class) {
 341         case REG_CLASS_TEXR:
 342                 return true;
 343         case REG_CLASS_TEXW:
 344                 return (tag == TAG_TEXTURE_4);
 345         case REG_CLASS_LDST:
 346         case REG_CLASS_WORK:
 347                 return IS_ALU(tag) || (tag == TAG_LOAD_STORE_4);
 348         default:
 349                 unreachable("Invalid class");
 350         }
 351 }
 352
 353 /* Prepass before RA to ensure special class restrictions are met. The idea is
 354  * to create a bit field of types of instructions that read a particular index.
 355  * Later, we'll add moves as appropriate and rewrite to specialize by type. */
 356
 357 static void
 358 mark_node_class (unsigned *bitfield, unsigned node)
 359 {
 360         if (node < SSA_FIXED_MINIMUM)
 361                 BITSET_SET(bitfield, node);
 362 }
 363
 364 void
 365 mir_lower_special_reads(compiler_context *ctx)
 366 {
 367         size_t sz = BITSET_WORDS(ctx->temp_count) * sizeof(BITSET_WORD);
 368
 369         /* Bitfields for the various types of registers we could have. aluw can
 370          * be written by either ALU or load/store */
 371
 372         unsigned *alur = calloc(sz, 1);
 373         unsigned *aluw = calloc(sz, 1);
 374         unsigned *brar = calloc(sz, 1);
 375         unsigned *ldst = calloc(sz, 1);
 376         unsigned *texr = calloc(sz, 1);
 377         unsigned *texw = calloc(sz, 1);
 378
 379         /* Pass #1 is analysis, a linear scan to fill out the bitfields */
 380
 381         mir_foreach_instr_global(ctx, ins) {
 382                 switch (ins->type) {
 383                 case TAG_ALU_4:
 384                         mark_node_class(aluw, ins->dest);
 385                         mark_node_class(alur, ins->src[0]);
 386                         mark_node_class(alur, ins->src[1]);
 387                         mark_node_class(alur, ins->src[2]);
 388
 389                         if (ins->compact_branch && ins->writeout)
 390                                 mark_node_class(brar, ins->src[0]);
 391
 392                         break;
 393
 394                 case TAG_LOAD_STORE_4:
 395                         mark_node_class(aluw, ins->dest);
 396                         mark_node_class(ldst, ins->src[0]);
 397                         mark_node_class(ldst, ins->src[1]);
 398                         mark_node_class(ldst, ins->src[2]);
 399                         break;
 400
 401                 case TAG_TEXTURE_4:
 402                         mark_node_class(texr, ins->src[0]);
 403                         mark_node_class(texr, ins->src[1]);
 404                         mark_node_class(texr, ins->src[2]);
 405                         mark_node_class(texw, ins->dest);
 406                         break;
 407                 }
 408         }
 409
 410         /* Pass #2 is lowering now that we've analyzed all the classes.
 411          * Conceptually, if an index is only marked for a single type of use,
 412          * there is nothing to lower. If it is marked for different uses, we
 413          * split up based on the number of types of uses. To do so, we divide
 414          * into N distinct classes of use (where N>1 by definition), emit N-1
 415          * moves from the index to copies of the index, and finally rewrite N-1
 416          * of the types of uses to use the corresponding move */
 417
 418         unsigned spill_idx = ctx->temp_count;
 419
 420         for (unsigned i = 0; i < ctx->temp_count; ++i) {
 421                 bool is_alur = BITSET_TEST(alur, i);
 422                 bool is_aluw = BITSET_TEST(aluw, i);
 423                 bool is_brar = BITSET_TEST(brar, i);
 424                 bool is_ldst = BITSET_TEST(ldst, i);
 425                 bool is_texr = BITSET_TEST(texr, i);
 426                 bool is_texw = BITSET_TEST(texw, i);
 427
 428                 /* Analyse to check how many distinct uses there are. ALU ops
 429                  * (alur) can read the results of the texture pipeline (texw)
 430                  * but not ldst or texr. Load/store ops (ldst) cannot read
 431                  * anything but load/store inputs. Texture pipeline cannot read
 432                  * anything but texture inputs. TODO: Simplify.  */
 433
 434                 bool collision =
 435                         (is_alur && (is_ldst || is_texr)) ||
 436                         (is_ldst && (is_alur || is_texr || is_texw)) ||
 437                         (is_texr && (is_alur || is_ldst || is_texw)) ||
 438                         (is_texw && (is_aluw || is_ldst || is_texr)) ||
 439                         (is_brar && is_texw);
 440
 441                 if (!collision)
 442                         continue;
 443
 444                 /* Use the index as-is as the work copy. Emit copies for
 445                  * special uses */
 446
 447                 unsigned classes[] = { TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4, TAG_ALU_4};
 448                 bool collisions[] = { is_ldst, is_texr, is_texw && is_aluw, is_brar };
 449
 450                 for (unsigned j = 0; j < ARRAY_SIZE(collisions); ++j) {
 451                         if (!collisions[j]) continue;
 452
 453                         /* When the hazard is from reading, we move and rewrite
 454                          * sources (typical case). When it's from writing, we
 455                          * flip the move and rewrite destinations (obscure,
 456                          * only from control flow -- impossible in SSA) */
 457
 458                         bool hazard_write = (j == 2);
 459
 460                         unsigned idx = spill_idx++;
 461
 462                         midgard_instruction m = hazard_write ?
 463                                 v_mov(idx, i) : v_mov(i, idx);
 464
 465                         /* Insert move before each read/write, depending on the
 466                          * hazard we're trying to account for */
 467
 468                         mir_foreach_instr_global_safe(ctx, pre_use) {
 469                                 if (pre_use->type != classes[j])
 470                                         continue;
 471
 472                                 if (hazard_write) {
 473                                         if (pre_use->dest != i)
 474                                                 continue;
 475                                 } else {
 476                                         if (!mir_has_arg(pre_use, i))
 477                                                 continue;
 478                                 }
 479
 480                                 if (hazard_write) {
 481                                         midgard_instruction *use = mir_next_op(pre_use);
 482                                         assert(use);
 483                                         mir_insert_instruction_before(ctx, use, m);
 484                                         mir_rewrite_index_dst_single(pre_use, i, idx);
 485                                 } else {
 486                                         idx = spill_idx++;
 487                                         m = v_mov(i, idx);
 488                                         m.mask = mir_from_bytemask(mir_bytemask_of_read_components(pre_use, i), midgard_reg_mode_32);
 489                                         mir_insert_instruction_before(ctx, pre_use, m);
 490                                         mir_rewrite_index_src_single(pre_use, i, idx);
 491                                 }
 492                         }
 493                 }
 494         }
 495
 496         free(alur);
 497         free(aluw);
 498         free(brar);
 499         free(ldst);
 500         free(texr);
 501         free(texw);
 502 }
 503
 504 /* We register allocate after scheduling, so we need to ensure instructions
 505  * executing in parallel within a segment of a bundle don't clobber each
 506  * other's registers. This is mostly a non-issue thanks to scheduling, but
 507  * there are edge cases. In particular, after a register is written in a
 508  * segment, it interferes with anything reading. */
 509
 510 static void
 511 mir_compute_segment_interference(
 512                 compiler_context *ctx,
 513                 struct lcra_state *l,
 514                 midgard_bundle *bun,
 515                 unsigned pivot,
 516                 unsigned i)
 517 {
 518         for (unsigned j = pivot; j < i; ++j) {
 519                 mir_foreach_src(bun->instructions[j], s) {
 520                         if (bun->instructions[j]->src[s] >= ctx->temp_count)
 521                                 continue;
 522
 523                         for (unsigned q = pivot; q < i; ++q) {
 524                                 if (bun->instructions[q]->dest >= ctx->temp_count)
 525                                         continue;
 526
 527                                 /* See dEQP-GLES2.functional.shaders.return.output_write_in_func_dynamic_fragment */
 528
 529                                 if (q >= j) {
 530                                         if (!(bun->instructions[j]->unit == UNIT_SMUL && bun->instructions[q]->unit == UNIT_VLUT))
 531                                                 continue;
 532                                 }
 533
 534                                 unsigned mask = mir_bytemask(bun->instructions[q]);
 535                                 unsigned rmask = mir_bytemask_of_read_components(bun->instructions[j], bun->instructions[j]->src[s]);
 536                                 lcra_add_node_interference(l, bun->instructions[q]->dest, mask, bun->instructions[j]->src[s], rmask);
 537                         }
 538                 }
 539         }
 540 }
 541
 542 static void
 543 mir_compute_bundle_interference(
 544                 compiler_context *ctx,
 545                 struct lcra_state *l,
 546                 midgard_bundle *bun)
 547 {
 548         if (!IS_ALU(bun->tag))
 549                 return;
 550
 551         bool old = bun->instructions[0]->unit >= UNIT_VADD;
 552         unsigned pivot = 0;
 553
 554         for (unsigned i = 1; i < bun->instruction_count; ++i) {
 555                 bool new = bun->instructions[i]->unit >= UNIT_VADD;
 556
 557                 if (old != new) {
 558                         mir_compute_segment_interference(ctx, l, bun, 0, i);
 559                         pivot = i;
 560                         break;
 561                 }
 562         }
 563
 564         mir_compute_segment_interference(ctx, l, bun, pivot, bun->instruction_count);
 565 }
 566
 567 static void
 568 mir_compute_interference(
 569                 compiler_context *ctx,
 570                 struct ra_graph *g,
 571                 struct lcra_state *l)
 572 {
 573         /* First, we need liveness information to be computed per block */
 574         mir_compute_liveness(ctx);
 575
 576         /* Now that every block has live_in/live_out computed, we can determine
 577          * interference by walking each block linearly. Take live_out at the
 578          * end of each block and walk the block backwards. */
 579
 580         mir_foreach_block(ctx, blk) {
 581                 uint16_t *live = mem_dup(blk->live_out, ctx->temp_count * sizeof(uint16_t));
 582
 583                 mir_foreach_instr_in_block_rev(blk, ins) {
 584                         /* Mark all registers live after the instruction as
 585                          * interfering with the destination */
 586
 587                         unsigned dest = ins->dest;
 588
 589                         if (dest < ctx->temp_count) {
 590                                 for (unsigned i = 0; i < ctx->temp_count; ++i)
 591                                         if (live[i]) {
 592                                                 unsigned mask = mir_bytemask(ins);
 593                                                 lcra_add_node_interference(l, dest, mask, i, live[i]);
 594                                         }
 595                         }
 596
 597                         /* Update live_in */
 598                         mir_liveness_ins_update(live, ins, ctx->temp_count);
 599                 }
 600
 601                 mir_foreach_bundle_in_block(blk, bun)
 602                         mir_compute_bundle_interference(ctx, l, bun);
 603
 604                 free(live);
 605         }
 606 }
 607
 608 /* This routine performs the actual register allocation. It should be succeeded
 609  * by install_registers */
 610
 611 struct lcra_state *
 612 allocate_registers(compiler_context *ctx, bool *spilled)
 613 {
 614         /* The number of vec4 work registers available depends on when the
 615          * uniforms start, so compute that first */
 616         int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
 617         unsigned *classes = NULL;
 618         struct ra_regs *regs = get_register_set(ctx->screen, work_count, &classes);
 619
 620         assert(regs != NULL);
 621         assert(classes != NULL);
 622
 623        /* No register allocation to do with no SSA */
 624
 625         if (!ctx->temp_count)
 626                 return NULL;
 627
 628         /* Let's actually do register allocation */
 629         int nodes = ctx->temp_count;
 630         struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
 631
 632         /* Register class (as known to the Mesa register allocator) is actually
 633          * the product of both semantic class (work, load/store, texture..) and
 634          * size (vec2/vec3..). First, we'll go through and determine the
 635          * minimum size needed to hold values */
 636
 637         struct lcra_state *l = lcra_alloc_equations(ctx->temp_count, 1, 8, 16, 5);
 638
 639         /* Starts of classes, in bytes */
 640         l->class_start[REG_CLASS_WORK]  = 16 * 0;
 641         l->class_start[REG_CLASS_LDST]  = 16 * 26;
 642         l->class_start[REG_CLASS_TEXR]  = 16 * 28;
 643         l->class_start[REG_CLASS_TEXW]  = 16 * 28;
 644
 645         l->class_size[REG_CLASS_WORK] = 16 * work_count;
 646         l->class_size[REG_CLASS_LDST]  = 16 * 2;
 647         l->class_size[REG_CLASS_TEXR]  = 16 * 2;
 648         l->class_size[REG_CLASS_TEXW]  = 16 * 2;
 649
 650         lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW);
 651
 652         unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
 653
 654         mir_foreach_instr_global(ctx, ins) {
 655                 if (ins->dest >= SSA_FIXED_MINIMUM) continue;
 656
 657                 /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */
 658                 int class = util_logbase2(ins->mask);
 659
 660                 /* Use the largest class if there's ambiguity, this
 661                  * handles partial writes */
 662
 663                 int dest = ins->dest;
 664                 found_class[dest] = MAX2(found_class[dest], class);
 665
 666                 lcra_set_alignment(l, dest, 2); /* (1 << 2) = 4 */
 667
 668                 /* XXX: Ensure swizzles align the right way with more LCRA constraints? */
 669                 if (ins->type == TAG_ALU_4 && ins->alu.reg_mode != midgard_reg_mode_32)
 670                         lcra_set_alignment(l, dest, 3); /* (1 << 3) = 8 */
 671         }
 672
 673         for (unsigned i = 0; i < ctx->temp_count; ++i)
 674                 lcra_restrict_range(l, i, (found_class[i] + 1) * 4);
 675
 676         /* Next, we'll determine semantic class. We default to zero (work).
 677          * But, if we're used with a special operation, that will force us to a
 678          * particular class. Each node must be assigned to exactly one class; a
 679          * prepass before RA should have lowered what-would-have-been
 680          * multiclass nodes into a series of moves to break it up into multiple
 681          * nodes (TODO) */
 682
 683         mir_foreach_instr_global(ctx, ins) {
 684                 /* Check if this operation imposes any classes */
 685
 686                 if (ins->type == TAG_LOAD_STORE_4) {
 687                         bool force_vec4_only = OP_IS_VEC4_ONLY(ins->load_store.op);
 688
 689                         set_class(found_class, ins->src[0], REG_CLASS_LDST);
 690                         set_class(found_class, ins->src[1], REG_CLASS_LDST);
 691                         set_class(found_class, ins->src[2], REG_CLASS_LDST);
 692
 693                         if (force_vec4_only) {
 694                                 force_vec4(found_class, ins->dest);
 695                                 force_vec4(found_class, ins->src[0]);
 696                                 force_vec4(found_class, ins->src[1]);
 697                                 force_vec4(found_class, ins->src[2]);
 698
 699                                 lcra_restrict_range(l, ins->dest, 16);
 700                         }
 701                 } else if (ins->type == TAG_TEXTURE_4) {
 702                         set_class(found_class, ins->dest, REG_CLASS_TEXW);
 703                         set_class(found_class, ins->src[0], REG_CLASS_TEXR);
 704                         set_class(found_class, ins->src[1], REG_CLASS_TEXR);
 705                         set_class(found_class, ins->src[2], REG_CLASS_TEXR);
 706                 }
 707         }
 708
 709         /* Check that the semantics of the class are respected */
 710         mir_foreach_instr_global(ctx, ins) {
 711                 assert(check_write_class(found_class, ins->type, ins->dest));
 712                 assert(check_read_class(found_class, ins->type, ins->src[0]));
 713                 assert(check_read_class(found_class, ins->type, ins->src[1]));
 714                 assert(check_read_class(found_class, ins->type, ins->src[2]));
 715         }
 716
 717         /* Mark writeout to r0 */
 718         mir_foreach_instr_global(ctx, ins) {
 719                 if (ins->compact_branch && ins->writeout && ins->src[0] < ctx->temp_count)
 720                         l->solutions[ins->src[0]] = 0;
 721         }
 722
 723         for (unsigned i = 0; i < ctx->temp_count; ++i) {
 724                 unsigned class = found_class[i];
 725                 l->class[i] = (class >> 2);
 726
 727                 ra_set_node_class(g, i, classes[class]);
 728         }
 729
 730         mir_compute_interference(ctx, g, l);
 731
 732         *spilled = !lcra_solve(l);
 733         return l;
 734 }
 735
 736 /* Once registers have been decided via register allocation
 737  * (allocate_registers), we need to rewrite the MIR to use registers instead of
 738  * indices */
 739
 740 static void
 741 install_registers_instr(
 742         compiler_context *ctx,
 743         struct lcra_state *l,
 744         midgard_instruction *ins)
 745 {
 746         switch (ins->type) {
 747         case TAG_ALU_4:
 748         case TAG_ALU_8:
 749         case TAG_ALU_12:
 750         case TAG_ALU_16: {
 751                  if (ins->compact_branch)
 752                          return;
 753
 754                 struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], mir_srcsize(ins, 0));
 755                 struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], mir_srcsize(ins, 1));
 756                 struct phys_reg dest = index_to_reg(ctx, l, ins->dest, mir_typesize(ins));
 757
 758                 mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset);
 759
 760                 unsigned dest_offset =
 761                         GET_CHANNEL_COUNT(alu_opcode_props[ins->alu.op].props) ? 0 :
 762                         dest.offset;
 763
 764                 offset_swizzle(ins->swizzle[0], src1.offset, src1.size, dest_offset);
 765
 766                 ins->registers.src1_reg = src1.reg;
 767
 768                 ins->registers.src2_imm = ins->has_inline_constant;
 769
 770                 if (ins->has_inline_constant) {
 771                         /* Encode inline 16-bit constant. See disassembler for
 772                          * where the algorithm is from */
 773
 774                         ins->registers.src2_reg = ins->inline_constant >> 11;
 775
 776                         int lower_11 = ins->inline_constant & ((1 << 12) - 1);
 777                         uint16_t imm = ((lower_11 >> 8) & 0x7) |
 778                                        ((lower_11 & 0xFF) << 3);
 779
 780                         ins->alu.src2 = imm << 2;
 781                 } else {
 782                         midgard_vector_alu_src mod2 =
 783                                 vector_alu_from_unsigned(ins->alu.src2);
 784                         offset_swizzle(ins->swizzle[1], src2.offset, src2.size, dest_offset);
 785                         ins->alu.src2 = vector_alu_srco_unsigned(mod2);
 786
 787                         ins->registers.src2_reg = src2.reg;
 788                 }
 789
 790                 ins->registers.out_reg = dest.reg;
 791                 break;
 792         }
 793
 794         case TAG_LOAD_STORE_4: {
 795                 /* Which physical register we read off depends on
 796                  * whether we are loading or storing -- think about the
 797                  * logical dataflow */
 798
 799                 bool encodes_src = OP_IS_STORE(ins->load_store.op);
 800
 801                 if (encodes_src) {
 802                         struct phys_reg src = index_to_reg(ctx, l, ins->src[0], mir_srcsize(ins, 0));
 803                         assert(src.reg == 26 || src.reg == 27);
 804
 805                         ins->load_store.reg = src.reg - 26;
 806                         offset_swizzle(ins->swizzle[0], src.offset, src.size, 0);
 807                } else {
 808                         struct phys_reg dst = index_to_reg(ctx, l, ins->dest, mir_typesize(ins));
 809
 810                         ins->load_store.reg = dst.reg;
 811                         offset_swizzle(ins->swizzle[0], 0, 4, dst.offset);
 812                         mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset);
 813                 }
 814
 815                 /* We also follow up by actual arguments */
 816
 817                 unsigned src2 = ins->src[1];
 818                 unsigned src3 = ins->src[2];
 819
 820                 if (src2 != ~0) {
 821                         struct phys_reg src = index_to_reg(ctx, l, src2, mir_srcsize(ins, 1));
 822                         unsigned component = src.offset / src.size;
 823                         assert(component * src.size == src.offset);
 824                         ins->load_store.arg_1 |= midgard_ldst_reg(src.reg, component);
 825                 }
 826
 827                 if (src3 != ~0) {
 828                         struct phys_reg src = index_to_reg(ctx, l, src3, mir_srcsize(ins, 2));
 829                         unsigned component = src.offset / src.size;
 830                         assert(component * src.size == src.offset);
 831                         ins->load_store.arg_2 |= midgard_ldst_reg(src.reg, component);
 832                 }
 833
 834                 break;
 835         }
 836
 837         case TAG_TEXTURE_4: {
 838                 /* Grab RA results */
 839                 struct phys_reg dest = index_to_reg(ctx, l, ins->dest, mir_typesize(ins));
 840                 struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], mir_srcsize(ins, 1));
 841                 struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], mir_srcsize(ins, 2));
 842
 843                 assert(dest.reg == 28 || dest.reg == 29);
 844                 assert(coord.reg == 28 || coord.reg == 29);
 845
 846                 /* First, install the texture coordinate */
 847                 ins->texture.in_reg_full = 1;
 848                 ins->texture.in_reg_upper = 0;
 849                 ins->texture.in_reg_select = coord.reg - 28;
 850                 offset_swizzle(ins->swizzle[1], coord.offset, coord.size, 0);
 851
 852                 /* Next, install the destination */
 853                 ins->texture.out_full = 1;
 854                 ins->texture.out_upper = 0;
 855                 ins->texture.out_reg_select = dest.reg - 28;
 856                 offset_swizzle(ins->swizzle[0], 0, 4, dest.offset);
 857                 mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset);
 858
 859                 /* If there is a register LOD/bias, use it */
 860                 if (ins->src[2] != ~0) {
 861                         assert(!(lod.offset & 3));
 862                         midgard_tex_register_select sel = {
 863                                 .select = lod.reg,
 864                                 .full = 1,
 865                                 .component = lod.offset / 4
 866                         };
 867
 868                         uint8_t packed;
 869                         memcpy(&packed, &sel, sizeof(packed));
 870                         ins->texture.bias = packed;
 871                 }
 872
 873                 break;
 874         }
 875
 876         default:
 877                 break;
 878         }
 879 }
 880
 881 void
 882 install_registers(compiler_context *ctx, struct lcra_state *l)
 883 {
 884         mir_foreach_instr_global(ctx, ins)
 885                 install_registers_instr(ctx, l, ins);
 886 }