pan/midgard: Track shader quadword count while scheduling
[mesa.git] / src / panfrost / midgard / midgard_schedule.c
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"
#include "util/register_allocate.h"

/* Scheduling for Midgard is complicated, to say the least. ALU instructions
 * must be grouped into VLIW bundles according to the following model:
 *
 *  [VMUL] [SADD]
 *  [VADD] [SMUL] [VLUT]
 *
 * A given instruction can execute on some subset of the units (or a few can
 * execute on all). Instructions can be either vector or scalar; only scalar
 * instructions can execute on SADD/SMUL units. Units on a given line execute
 * in parallel. Subsequent lines execute separately and can pass results
 * directly via pipeline registers r24/r25, bypassing the register file.
 *
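 * For example (illustrative mnemonics only; exact encodings differ), one
 * bundle could run a vector multiply on VMUL and a scalar add on SADD in the
 * first stage, then a vector add on VADD in the second stage that consumes
 * both results through the pipeline registers:
 *
 *   [ vmul.fmul r24,  r1,  r2  ] [ sadd.fadd r25.x, r3.x, r4.x ]
 *   [ vadd.fadd  r5,  r24, r25 ]
 *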
 * A bundle can optionally have 128-bits of embedded constants, shared across
 * all of the instructions within a bundle.
 *
 * Instructions consuming conditionals (branches and conditional selects)
 * require their condition to be written into the conditional register (r31)
 * within the same bundle they are consumed.
 *
 * Fragment writeout requires its argument to be written in full within the
 * same bundle as the branch, with no hanging dependencies.
 *
 * Load/store instructions are bundled in pairs of at most two instructions,
 * and texture instructions are not bundled at all.
 *
 * -------------------------------------------------------------------------
 *
 */

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */

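/* For example, a swizzle of .xxyy packs as 0b01010000; it reads only the X
 * and Y source components, so the resulting access mask is 0x3 */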
static unsigned
swizzle_to_access_mask(unsigned swizzle)
{
        unsigned component_mask = 0;

        for (int i = 0; i < 4; ++i) {
                unsigned c = (swizzle >> (2 * i)) & 3;
                component_mask |= (1 << c);
        }

        return component_mask;
}

/* Does the mask cover exactly one component, i.e. is the access scalar? */

static bool
is_single_component_mask(unsigned mask)
{
        int components = 0;

        for (int c = 0; c < 8; ++c) {
                if (mask & (1 << c))
                        components++;
        }

        return components == 1;
}

/* Checks for an SSA data hazard between two adjacent instructions, keeping in
 * mind that we are a vector architecture and we can write to different
 * components simultaneously */

static bool
can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
{
        /* Writeout has its own rules anyway */
        if (first->compact_branch || second->compact_branch)
                return true;

        /* Each instruction reads some registers and writes to a register. See
         * where the first writes */

        int source = first->dest;
        int source_mask = first->mask;

        /* As long as the second doesn't read from the first, we're okay */
        for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
                if (second->src[i] != source)
                        continue;

                if (first->type != TAG_ALU_4)
                        return false;

                /* Figure out which components we just read from */

                int q = (i == 0) ? second->alu.src1 : second->alu.src2;
                midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;

                /* Check if there are components in common, and fail if so */
                if (swizzle_to_access_mask(m->swizzle) & source_mask)
                        return false;
        }

        /* Otherwise, it's safe in that regard. Another data hazard is both
         * writing to the same place, of course */

        if (second->dest == source) {
                /* ...but only if the components overlap */

                if (second->mask & source_mask)
                        return false;
        }

        /* ...That's it */
        return true;
}

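/* Returns whether ains would conflict (per can_run_concurrent_ssa) with any
 * instruction already scheduled into the given segment */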
static bool
midgard_has_hazard(
        midgard_instruction **segment, unsigned segment_size,
        midgard_instruction *ains)
{
        for (int s = 0; s < segment_size; ++s)
                if (!can_run_concurrent_ssa(segment[s], ains))
                        return true;

        return false;
}

/* Fragment writeout (of r0) is allowed when:
 *
 *  - All components of r0 are written in the bundle
 *  - No components of r0 are written in VLUT
 *  - Non-pipelined dependencies of r0 are not written in the bundle
 *
 * This function checks if these requirements are satisfied given the content
 * of a scheduled bundle.
 */

static bool
can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count)
{
        /* First scan for which components of r0 are written out. Initially
         * none are written */

        uint8_t r0_written_mask = 0x0;

        /* Simultaneously we scan for the set of dependencies */

        size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
        BITSET_WORD *dependencies = alloca(sz);
        memset(dependencies, 0, sz);

        for (unsigned i = 0; i < count; ++i) {
                midgard_instruction *ins = bundle[i];

                if (ins->dest != SSA_FIXED_REGISTER(0))
                        continue;

                /* Record written out mask */
                r0_written_mask |= ins->mask;

                /* Record dependencies, but only if they won't become pipeline
                 * registers. We know we can't be live after this, because
                 * we're writeout at the very end of the shader. So check if
                 * they were written before us. */

                unsigned src0 = ins->src[0];
                unsigned src1 = ins->src[1];

                if (!mir_is_written_before(ctx, bundle[0], src0))
                        src0 = ~0;

                if (!mir_is_written_before(ctx, bundle[0], src1))
                        src1 = ~0;

                if (src0 < node_count)
                        BITSET_SET(dependencies, src0);

                if (src1 < node_count)
                        BITSET_SET(dependencies, src1);

                /* Requirement 2 */
                if (ins->unit == UNIT_VLUT)
                        return false;
        }

        /* Requirement 1 */
        if ((r0_written_mask & 0xF) != 0xF)
                return false;

        /* Requirement 3 */

        for (unsigned i = 0; i < count; ++i) {
                unsigned dest = bundle[i]->dest;

                if (dest < node_count && BITSET_TEST(dependencies, dest))
                        return false;
        }

        /* Otherwise, we're good to go */
        return true;
}

/* Helpers for scheduling */

static bool
mir_is_scalar(midgard_instruction *ains)
{
        /* Does the op support scalar units? */
        if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR))
                return false;

        /* Do we try to use it as a vector op? */
        if (!is_single_component_mask(ains->mask))
                return false;

        /* Otherwise, check mode hazards */
        bool could_scalar = true;

        /* Only 16/32-bit can run on a scalar unit */
        could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
        could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
        could_scalar &= ains->alu.dest_override == midgard_dest_override_none;

        if (ains->alu.reg_mode == midgard_reg_mode_16) {
                /* If we're running in 16-bit mode, we can't have any 8-bit
                 * sources on the scalar unit (since the scalar unit doesn't
                 * understand 8-bit) */

                midgard_vector_alu_src s1 =
                        vector_alu_from_unsigned(ains->alu.src1);

                could_scalar &= !s1.half;

                midgard_vector_alu_src s2 =
                        vector_alu_from_unsigned(ains->alu.src2);

                could_scalar &= !s2.half;
        }

        return could_scalar;
}

/* How many bytes does this ALU instruction add to the bundle? */

static unsigned
bytes_for_instruction(midgard_instruction *ains)
{
        if (ains->unit & UNITS_ANY_VECTOR)
                return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
        else if (ains->unit == ALU_ENAB_BRANCH)
                return sizeof(midgard_branch_extended);
        else if (ains->compact_branch)
                return sizeof(ains->br_compact);
        else
                return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
}

/* Schedules, but does not emit, a single basic block. After scheduling, the
 * final tag and size of the block are known, which are necessary for
 * branching */

static midgard_bundle
schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
{
        int instructions_emitted = 0, packed_idx = 0;
        midgard_bundle bundle = { 0 };

        midgard_instruction *scheduled[5] = { NULL };

        uint8_t tag = ins->type;

        /* Default to the instruction's tag */
        bundle.tag = tag;

        switch (ins->type) {
        case TAG_ALU_4: {
                uint32_t control = 0;
                size_t bytes_emitted = sizeof(control);

                /* TODO: Constant combining */
                int index = 0, last_unit = 0;

                /* Previous instructions, for the purpose of parallelism */
                midgard_instruction *segment[4] = {0};
                int segment_size = 0;

                instructions_emitted = -1;
                midgard_instruction *pins = ins;

                unsigned constant_count = 0;

                for (;;) {
                        midgard_instruction *ains = pins;

                        /* Advance instruction pointer */
                        if (index) {
                                ains = mir_next_op(pins);
                                pins = ains;
                        }

                        /* Out-of-work condition */
                        if ((struct list_head *) ains == &block->instructions)
                                break;

                        /* Ensure that the chain can continue */
                        if (ains->type != TAG_ALU_4) break;

                        /* If there's already something in the bundle and we
                         * have weird scheduler constraints, break now */
                        if (ains->precede_break && index) break;

                        /* According to the presentation "The ARM
                         * Mali-T880 Mobile GPU" from HotChips 27,
                         * there are two pipeline stages. Branching
                         * position determined experimentally. Lines
                         * are executed in parallel:
                         *
                         * [ VMUL ] [ SADD ]
                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
                         *
                         * Verify that there are no ordering dependencies here.
                         *
                         * TODO: Allow for parallelism!!!
                         */

                        /* Pick a unit for it if it doesn't force a particular unit */

                        int unit = ains->unit;

                        if (!unit) {
                                int op = ains->alu.op;
                                int units = alu_opcode_props[op].props;
                                bool scalar = mir_is_scalar(ains);

                                if (!scalar) {
                                        if (last_unit >= UNIT_VADD) {
                                                if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                } else {
                                        if (last_unit >= UNIT_VADD) {
                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        } else {
                                                if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
                                                        unit = UNIT_VMUL;
                                                else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
                                                        unit = UNIT_SADD;
                                                else if (units & UNIT_VADD)
                                                        unit = UNIT_VADD;
                                                else if (units & UNIT_SMUL)
                                                        unit = UNIT_SMUL;
                                                else if (units & UNIT_VLUT)
                                                        unit = UNIT_VLUT;
                                                else
                                                        break;
                                        }
                                }

                                assert(unit & units);
                        }

                        /* Late unit check, this time for encoding (not parallelism) */
                        if (unit <= last_unit) break;

                        /* Clear the segment */
                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
                                segment_size = 0;

                        if (midgard_has_hazard(segment, segment_size, ains))
                                break;

                        /* We're good to go -- emit the instruction */
                        ains->unit = unit;

                        segment[segment_size++] = ains;

                        /* We try to reuse constants if possible, by adjusting
                         * the swizzle */

                        if (ains->has_blend_constant) {
                                /* Everything conflicts with the blend constant */
                                if (bundle.has_embedded_constants)
                                        break;

                                bundle.has_blend_constant = 1;
                                bundle.has_embedded_constants = 1;
                        } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
                                /* TODO: DRY with the analysis pass */

                                if (bundle.has_blend_constant)
                                        break;

                                if (constant_count)
                                        break;

                                /* TODO: Fix packing XXX */
                                uint16_t *bundles = (uint16_t *) bundle.constants;
                                uint32_t *constants = (uint32_t *) ains->constants;

                                /* Copy them wholesale */
                                for (unsigned i = 0; i < 4; ++i)
                                        bundles[i] = constants[i];

                                bundle.has_embedded_constants = true;
                                constant_count = 4;
                        } else if (ains->has_constants) {
                                /* By definition, blend constants conflict with
                                 * everything, so if there are already
                                 * constants we break the bundle *now* */

                                if (bundle.has_blend_constant)
                                        break;

                                /* For anything but blend constants, we can do
                                 * proper analysis, however */

                                /* TODO: Mask by which are used */
                                uint32_t *constants = (uint32_t *) ains->constants;
                                uint32_t *bundles = (uint32_t *) bundle.constants;

                                uint32_t indices[4] = { 0 };
                                bool break_bundle = false;

                                for (unsigned i = 0; i < 4; ++i) {
                                        uint32_t cons = constants[i];
                                        bool constant_found = false;

                                        /* Search for the constant */
                                        for (unsigned j = 0; j < constant_count; ++j) {
                                                if (bundles[j] != cons)
                                                        continue;

                                                /* We found it, reuse */
                                                indices[i] = j;
                                                constant_found = true;
                                                break;
                                        }

                                        if (constant_found)
                                                continue;

                                        /* We didn't find it, so allocate it */
                                        unsigned idx = constant_count++;

                                        if (idx >= 4) {
                                                /* Uh-oh, out of space */
                                                break_bundle = true;
                                                break;
                                        }

                                        /* We have space, copy it in! */
                                        bundles[idx] = cons;
                                        indices[i] = idx;
                                }

                                if (break_bundle)
                                        break;

                                /* Cool, we have it in. So use indices as a
                                 * swizzle */

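                                /* e.g. if this op's constants landed in
                                 * embedded slots { 1, 0, 2, 2 }, its constant
                                 * sources are reswizzled to .yxzz */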
                                unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
                                unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);

                                if (ains->src[0] == r_constant)
                                        ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);

                                if (ains->src[1] == r_constant)
                                        ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);

                                bundle.has_embedded_constants = true;
                        }

                        if (ains->compact_branch) {
                                /* All of r0 has to be written out along with
                                 * the branch writeout */

                                if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count)) {
                                        /* We only work on full moves
                                         * at the beginning. We could
                                         * probably do better */
                                        if (index != 0)
                                                break;

                                        /* Inject a move */
                                        midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
                                        ins.unit = UNIT_VMUL;
                                        control |= ins.unit;

                                        /* TODO don't leak */
                                        midgard_instruction *move =
                                                mem_dup(&ins, sizeof(midgard_instruction));
                                        bytes_emitted += bytes_for_instruction(move);
                                        bundle.instructions[packed_idx++] = move;
                                }
                        }

                        bytes_emitted += bytes_for_instruction(ains);

                        /* Defer marking until after writing to allow for break */
                        scheduled[index] = ains;
                        control |= ains->unit;
                        last_unit = ains->unit;
                        ++instructions_emitted;
                        ++index;
                }

                int padding = 0;

                /* Pad ALU op to nearest quadword */

                if (bytes_emitted & 15) {
                        padding = 16 - (bytes_emitted & 15);
                        bytes_emitted += padding;
                }

                /* Constants must always be quadwords */
                if (bundle.has_embedded_constants)
                        bytes_emitted += 16;

                /* Size ALU instruction for tag */
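                /* e.g. a bundle totalling 28 bytes pads to 32 bytes (two
                 * quadwords) and is tagged TAG_ALU_8 */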
                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
                bundle.padding = padding;
                bundle.control = bundle.tag | control;

                break;
        }

        case TAG_LOAD_STORE_4: {
                /* A load/store bundle holds two instruction words at once. If
                 * we only have one instruction queued up, we need to NOP pad.
                 * Otherwise, we pack both in succession to save space and
                 * cycles -- letting them go in parallel -- and skip the next
                 * instruction. The usefulness of this optimisation is greatly
                 * dependent on the quality of the instruction scheduler.
                 */

                midgard_instruction *next_op = mir_next_op(ins);

                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
                        /* TODO: Concurrency check */
                        instructions_emitted++;
                }

                break;
        }

        case TAG_TEXTURE_4: {
                /* Which tag we use depends on the shader stage */
                bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
                bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
                break;
        }

        default:
                unreachable("Unknown tag");
                break;
        }

        /* Copy the instructions into the bundle */
        bundle.instruction_count = instructions_emitted + 1 + packed_idx;

        midgard_instruction *uins = ins;
        for (; packed_idx < bundle.instruction_count; ++packed_idx) {
                assert(&uins->link != &block->instructions);
                bundle.instructions[packed_idx] = uins;
                uins = mir_next_op(uins);
        }

        *skip = instructions_emitted;

        return bundle;
}

/* Schedule a single block by iterating its instructions to create bundles.
 * As we go, tally up the bundle sizes to compute the block size. */

static void
schedule_block(compiler_context *ctx, midgard_block *block)
{
        util_dynarray_init(&block->bundles, NULL);

        block->quadword_count = 0;

        mir_foreach_instr_in_block(block, ins) {
                int skip;
                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
                util_dynarray_append(&block->bundles, midgard_bundle, bundle);

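                /* The blend constant is patched into the last quadword of
                 * this bundle, so record its byte offset: (quadwords before
                 * the bundle + the bundle's own size - 1) * 16 */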
                if (bundle.has_blend_constant) {
                        unsigned offset = ctx->quadword_count + block->quadword_count + quadword_size(bundle.tag) - 1;
                        ctx->blend_constant_offset = offset * 0x10;
                }

                while(skip--)
                        ins = mir_next_op(ins);

                block->quadword_count += quadword_size(bundle.tag);
        }

        block->is_scheduled = true;
        ctx->quadword_count += block->quadword_count;
}

/* The following passes reorder MIR instructions to enable better scheduling */

static void
midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
{
        mir_foreach_instr_in_block_safe(block, ins) {
                if (ins->type != TAG_LOAD_STORE_4) continue;

                /* We've found a load/store op. Check if next is also load/store. */
                midgard_instruction *next_op = mir_next_op(ins);
                if (&next_op->link != &block->instructions) {
                        if (next_op->type == TAG_LOAD_STORE_4) {
                                /* If so, we're done since we're a pair */
                                ins = mir_next_op(ins);
                                continue;
                        }

                        /* Maximum search distance to pair, to avoid register pressure disasters */
                        int search_distance = 8;

                        /* Otherwise, we have an orphaned load/store -- search for another load */
                        mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
                                /* Terminate search if necessary */
                                if (!(search_distance--)) break;

                                if (c->type != TAG_LOAD_STORE_4) continue;

                                /* We can only reorder if there are no sources */

                                bool deps = false;

                                for (unsigned s = 0; s < ARRAY_SIZE(ins->src); ++s)
                                        deps |= (c->src[s] != ~0);

                                if (deps)
                                        continue;

                                /* We found one! Move it up to pair and remove it from the old location */

                                mir_insert_instruction_before(ctx, ins, *c);
                                mir_remove_instruction(c);

                                break;
                        }
                }
        }
}

/* When we're 'squeezing down' the values in the IR, we maintain a hash
 * as such */

static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
        if (hash >= SSA_FIXED_MINIMUM)
                return hash;

        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
                                ctx->hash_to_temp, hash + 1);

        if (temp)
                return temp - 1;

        /* If no temp is found, allocate one */
        temp = ctx->temp_count++;
        ctx->max_hash = MAX2(ctx->max_hash, hash);

        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
                                    hash + 1, (void *) ((uintptr_t) temp + 1));

        return temp;
}

/* Reassigns numbering to get rid of gaps in the indices */

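/* For example, live indices { 2, 5, 9 } are remapped to { 0, 1, 2 }, leaving
 * ctx->temp_count equal to the number of distinct values in use */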
static void
mir_squeeze_index(compiler_context *ctx)
{
        /* Reset */
        ctx->temp_count = 0;
        /* TODO don't leak old hash_to_temp */
        ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);

        mir_foreach_instr_global(ctx, ins) {
                ins->dest = find_or_allocate_temp(ctx, ins->dest);

                for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
                        ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
        }
}

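/* Creates a load from (or store to) a 128-bit Thread Local Storage slot,
 * used to spill and unspill work registers */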
static midgard_instruction
v_load_store_scratch(
        unsigned srcdest,
        unsigned index,
        bool is_store,
        unsigned mask)
{
        /* We index by 32-bit vec4s */
        unsigned byte = (index * 4 * 4);

        midgard_instruction ins = {
                .type = TAG_LOAD_STORE_4,
                .mask = mask,
                .dest = ~0,
                .src = { ~0, ~0, ~0 },
                .load_store = {
                        .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
                        .swizzle = SWIZZLE_XYZW,

                        /* For register spilling - to thread local storage */
                        .arg_1 = 0xEA,
                        .arg_2 = 0x1E,

                        /* Splattered across, TODO combine logically */
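                        /* e.g. spill slot 3 starts at byte 48, giving
                         * varying_parameters = 96 and address = 0 */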
                        .varying_parameters = (byte & 0x1FF) << 1,
                        .address = (byte >> 9)
                },

                /* If we spill an unspill, RA goes into an infinite loop */
                .no_spill = true
        };

        if (is_store) {
                /* r0 = r26, r1 = r27 */
                assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
                ins.src[0] = srcdest;
        } else {
                ins.dest = srcdest;
        }

        return ins;
}

/* If register allocation fails, find the best spill node and spill it to fix
 * whatever the issue was. This spill node could be a work register (spilling
 * to thread local storage), but it could also simply be a special register
 * that needs to spill to become a work register. */

static void mir_spill_register(
                compiler_context *ctx,
                struct ra_graph *g,
                unsigned *spill_count)
{
        unsigned spill_index = ctx->temp_count;

        /* Our first step is to calculate spill cost to figure out the best
         * spill node. All nodes are equal in spill cost, but we can't spill
         * nodes written to from an unspill */

        for (unsigned i = 0; i < ctx->temp_count; ++i) {
                ra_set_node_spill_cost(g, i, 1.0);
        }

        mir_foreach_instr_global(ctx, ins) {
                if (ins->no_spill &&
                    ins->dest >= 0 &&
                    ins->dest < ctx->temp_count)
                        ra_set_node_spill_cost(g, ins->dest, -1.0);
        }

        int spill_node = ra_get_best_spill_node(g);

        if (spill_node < 0) {
                mir_print_shader(ctx);
                assert(0);
        }

        /* We have a spill node, so check the class. Work registers
         * legitimately spill to TLS, but special registers just spill to work
         * registers */

        unsigned class = ra_get_node_class(g, spill_node);
        bool is_special = (class >> 2) != REG_CLASS_WORK;
        bool is_special_w = (class >> 2) == REG_CLASS_TEXW;

        /* Allocate TLS slot (maybe) */
        unsigned spill_slot = !is_special ? (*spill_count)++ : 0;

        /* For TLS, replace all stores to the spilled node. For
         * special reads, just keep as-is; the class will be demoted
         * implicitly. For special writes, spill to a work register */

        if (!is_special || is_special_w) {
                if (is_special_w)
                        spill_slot = spill_index++;

                mir_foreach_instr_global_safe(ctx, ins) {
                        if (ins->dest != spill_node) continue;

                        midgard_instruction st;

                        if (is_special_w) {
                                st = v_mov(spill_node, blank_alu_src, spill_slot);
                                st.no_spill = true;
                        } else {
                                ins->dest = SSA_FIXED_REGISTER(26);
                                st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
                        }

                        /* Hint: don't rewrite this node */
                        st.hint = true;

                        mir_insert_instruction_before(ctx, mir_next_op(ins), st);

                        if (!is_special)
                                ctx->spills++;
                }
        }

        /* For special reads, figure out how many components we need */
        unsigned read_mask = 0;

        mir_foreach_instr_global_safe(ctx, ins) {
                read_mask |= mir_mask_of_read_components(ins, spill_node);
        }

        /* Insert a load from TLS before the first consecutive
         * use of the node, rewriting to use spilled indices to
         * break up the live range. Or, for special, insert a
         * move. Ironically the latter *increases* register
         * pressure, but the two uses of the spilling mechanism
         * are somewhat orthogonal. (special spilling is to use
         * work registers to back special registers; TLS
         * spilling is to use memory to back work registers) */

        mir_foreach_block(ctx, block) {
                bool consecutive_skip = false;
                unsigned consecutive_index = 0;

                mir_foreach_instr_in_block(block, ins) {
                        /* We can't rewrite the moves used to spill in the
                         * first place. These moves are hinted. */
                        if (ins->hint) continue;

                        if (!mir_has_arg(ins, spill_node)) {
                                consecutive_skip = false;
                                continue;
                        }

                        if (consecutive_skip) {
                                /* Rewrite */
                                mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
                                continue;
                        }

                        if (!is_special_w) {
                                consecutive_index = ++spill_index;

                                midgard_instruction *before = ins;

                                /* For a csel, go back one more so as not to break up the bundle */
                                if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
                                        before = mir_prev_op(before);

                                midgard_instruction st;

                                if (is_special) {
                                        /* Move */
                                        st = v_mov(spill_node, blank_alu_src, consecutive_index);
                                        st.no_spill = true;
                                } else {
                                        /* TLS load */
                                        st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
                                }

                                /* Mask the load based on the component count
                                 * actually needed to prevent RA loops */

                                st.mask = read_mask;

                                mir_insert_instruction_before(ctx, before, st);
                                // consecutive_skip = true;
                        } else {
                                /* Special writes already have their move spilled in */
                                consecutive_index = spill_slot;
                        }

                        /* Rewrite to use */
                        mir_rewrite_index_src_single(ins, spill_node, consecutive_index);

                        if (!is_special)
                                ctx->fills++;
                }
        }

        /* Reset hints */

        mir_foreach_instr_global(ctx, ins) {
                ins->hint = false;
        }
}

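/* Runs the pre-RA lowering passes, allocates registers (spilling until
 * allocation succeeds), and then schedules each block into bundles */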
void
schedule_program(compiler_context *ctx)
{
        struct ra_graph *g = NULL;
        bool spilled = false;
        int iter_count = 1000; /* max iterations */

        /* Number of 128-bit slots in memory we've spilled into */
        unsigned spill_count = 0;

        midgard_promote_uniforms(ctx, 16);

        mir_foreach_block(ctx, block) {
                midgard_pair_load_store(ctx, block);
        }

        /* Must be lowered right before RA */
        mir_squeeze_index(ctx);
        mir_lower_special_reads(ctx);

        /* Lowering can introduce some dead moves */

        mir_foreach_block(ctx, block) {
                midgard_opt_dead_move_eliminate(ctx, block);
        }

        do {
                if (spilled)
                        mir_spill_register(ctx, g, &spill_count);

                mir_squeeze_index(ctx);

                g = NULL;
                g = allocate_registers(ctx, &spilled);
        } while(spilled && ((iter_count--) > 0));

        /* We can simplify a bit after RA */

        mir_foreach_block(ctx, block) {
                midgard_opt_post_move_eliminate(ctx, block, g);
        }

        /* After RA finishes, we schedule all at once */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }

        /* Finally, we create pipeline registers as a peephole pass after
         * scheduling. This isn't totally optimal, since there are cases where
         * the usage of pipeline registers can eliminate spills, but it does
         * save some power */

        mir_create_pipeline_registers(ctx);

        if (iter_count <= 0) {
                fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
                assert(0);
        }

        /* Report spilling information. spill_count is in 128-bit slots (vec4 x
         * fp32), but tls_size is in bytes, so multiply by 16 */

        ctx->tls_size = spill_count * 16;

        install_registers(ctx, g);
}