src/panfrost/midgard/midgard_schedule.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "compiler.h"
  25 #include "midgard_ops.h"
  26 #include "util/u_memory.h"
  27 #include "util/register_allocate.h"
  28
  29 /* Create a mask of accessed components from a swizzle to figure out vector
  30  * dependencies */
  31
  32 static unsigned
  33 swizzle_to_access_mask(unsigned swizzle)
  34 {
  35         unsigned component_mask = 0;
  36
  37         for (int i = 0; i < 4; ++i) {
  38                 unsigned c = (swizzle >> (2 * i)) & 3;
  39                 component_mask |= (1 << c);
  40         }
  41
  42         return component_mask;
  43 }
  44
  45 /* Does the mask cover more than a scalar? */
  46
  47 static bool
  48 is_single_component_mask(unsigned mask)
  49 {
  50         int components = 0;
  51
  52         for (int c = 0; c < 8; ++c) {
  53                 if (mask & (1 << c))
  54                         components++;
  55         }
  56
  57         return components == 1;
  58 }
  59
  60 /* Checks for an SSA data hazard between two adjacent instructions, keeping in
  61  * mind that we are a vector architecture and we can write to different
  62  * components simultaneously */
  63
  64 static bool
  65 can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
  66 {
  67         /* Writeout has its own rules anyway */
  68         if (first->compact_branch || second->compact_branch)
  69                 return true;
  70
  71         /* Each instruction reads some registers and writes to a register. See
  72          * where the first writes */
  73
  74         int source = first->dest;
  75         int source_mask = first->mask;
  76
  77         /* As long as the second doesn't read from the first, we're okay */
  78         for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
  79                 if (second->src[i] != source)
  80                         continue;
  81
  82                 if (first->type != TAG_ALU_4)
  83                         return false;
  84
  85                 /* Figure out which components we just read from */
  86
  87                 int q = (i == 0) ? second->alu.src1 : second->alu.src2;
  88                 midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
  89
  90                 /* Check if there are components in common, and fail if so */
  91                 if (swizzle_to_access_mask(m->swizzle) & source_mask)
  92                         return false;
  93         }
  94
  95         /* Otherwise, it's safe in that regard. Another data hazard is both
  96          * writing to the same place, of course */
  97
  98         if (second->dest == source) {
  99                 /* ...but only if the components overlap */
 100
 101                 if (second->mask & source_mask)
 102                         return false;
 103         }
 104
 105         /* ...That's it */
 106         return true;
 107 }
 108
 109 static bool
 110 midgard_has_hazard(
 111         midgard_instruction **segment, unsigned segment_size,
 112         midgard_instruction *ains)
 113 {
 114         for (int s = 0; s < segment_size; ++s)
 115                 if (!can_run_concurrent_ssa(segment[s], ains))
 116                         return true;
 117
 118         return false;
 119
 120
 121 }
 122
 123 /* Fragment writeout (of r0) is allowed when:
 124  *
 125  *  - All components of r0 are written in the bundle
 126  *  - No components of r0 are written in VLUT
 127  *  - Non-pipelined dependencies of r0 are not written in the bundle
 128  *
 129  * This function checks if these requirements are satisfied given the content
 130  * of a scheduled bundle.
 131  */
 132
 133 static bool
 134 can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count)
 135 {
 136         /* First scan for which components of r0 are written out. Initially
 137          * none are written */
 138
 139         uint8_t r0_written_mask = 0x0;
 140
 141         /* Simultaneously we scan for the set of dependencies */
 142
 143         size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
 144         BITSET_WORD *dependencies = alloca(sz);
 145         memset(dependencies, 0, sz);
 146
 147         for (unsigned i = 0; i < count; ++i) {
 148                 midgard_instruction *ins = bundle[i];
 149
 150                 if (ins->dest != SSA_FIXED_REGISTER(0))
 151                         continue;
 152
 153                 /* Record written out mask */
 154                 r0_written_mask |= ins->mask;
 155
 156                 /* Record dependencies, but only if they won't become pipeline
 157                  * registers. We know we can't be live after this, because
 158                  * we're writeout at the very end of the shader. So check if
 159                  * they were written before us. */
 160
 161                 unsigned src0 = ins->src[0];
 162                 unsigned src1 = ins->src[1];
 163
 164                 if (!mir_is_written_before(ctx, bundle[0], src0))
 165                         src0 = ~0;
 166
 167                 if (!mir_is_written_before(ctx, bundle[0], src1))
 168                         src1 = ~0;
 169
 170                 if (src0 < node_count)
 171                         BITSET_SET(dependencies, src0);
 172
 173                 if (src1 < node_count)
 174                         BITSET_SET(dependencies, src1);
 175
 176                 /* Requirement 2 */
 177                 if (ins->unit == UNIT_VLUT)
 178                         return false;
 179         }
 180
 181         /* Requirement 1 */
 182         if ((r0_written_mask & 0xF) != 0xF)
 183                 return false;
 184
 185         /* Requirement 3 */
 186
 187         for (unsigned i = 0; i < count; ++i) {
 188                 unsigned dest = bundle[i]->dest;
 189
 190                 if (dest < node_count && BITSET_TEST(dependencies, dest))
 191                         return false;
 192         }
 193
 194         /* Otherwise, we're good to go */
 195         return true;
 196 }
 197
 198 /* Helpers for scheduling */
 199
 200 static bool
 201 mir_is_scalar(midgard_instruction *ains)
 202 {
 203         /* Does the op support scalar units? */
 204         if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR))
 205                 return false;
 206
 207         /* Do we try to use it as a vector op? */
 208         if (!is_single_component_mask(ains->mask))
 209                 return false;
 210
 211         /* Otherwise, check mode hazards */
 212         bool could_scalar = true;
 213
 214         /* Only 16/32-bit can run on a scalar unit */
 215         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
 216         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
 217         could_scalar &= ains->alu.dest_override == midgard_dest_override_none;
 218
 219         if (ains->alu.reg_mode == midgard_reg_mode_16) {
 220                 /* If we're running in 16-bit mode, we
 221                  * can't have any 8-bit sources on the
 222                  * scalar unit (since the scalar unit
 223                  * doesn't understand 8-bit) */
 224
 225                 midgard_vector_alu_src s1 =
 226                         vector_alu_from_unsigned(ains->alu.src1);
 227
 228                 could_scalar &= !s1.half;
 229
 230                 midgard_vector_alu_src s2 =
 231                         vector_alu_from_unsigned(ains->alu.src2);
 232
 233                 could_scalar &= !s2.half;
 234         }
 235
 236         return could_scalar;
 237 }
 238
 239 /* How many bytes does this ALU instruction add to the bundle? */
 240
 241 static unsigned
 242 bytes_for_instruction(midgard_instruction *ains)
 243 {
 244         if (ains->unit & UNITS_ANY_VECTOR)
 245                 return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
 246         else if (ains->unit == ALU_ENAB_BRANCH)
 247                 return sizeof(midgard_branch_extended);
 248         else if (ains->compact_branch)
 249                 return sizeof(ains->br_compact);
 250         else
 251                 return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
 252 }
 253
 254 /* Schedules, but does not emit, a single basic block. After scheduling, the
 255  * final tag and size of the block are known, which are necessary for branching
 256  * */
 257
 258 static midgard_bundle
 259 schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
 260 {
 261         int instructions_emitted = 0, packed_idx = 0;
 262         midgard_bundle bundle = { 0 };
 263
 264         midgard_instruction *scheduled[5] = { NULL };
 265
 266         uint8_t tag = ins->type;
 267
 268         /* Default to the instruction's tag */
 269         bundle.tag = tag;
 270
 271         switch (ins->type) {
 272         case TAG_ALU_4: {
 273                 uint32_t control = 0;
 274                 size_t bytes_emitted = sizeof(control);
 275
 276                 /* TODO: Constant combining */
 277                 int index = 0, last_unit = 0;
 278
 279                 /* Previous instructions, for the purpose of parallelism */
 280                 midgard_instruction *segment[4] = {0};
 281                 int segment_size = 0;
 282
 283                 instructions_emitted = -1;
 284                 midgard_instruction *pins = ins;
 285
 286                 unsigned constant_count = 0;
 287
 288                 for (;;) {
 289                         midgard_instruction *ains = pins;
 290
 291                         /* Advance instruction pointer */
 292                         if (index) {
 293                                 ains = mir_next_op(pins);
 294                                 pins = ains;
 295                         }
 296
 297                         /* Out-of-work condition */
 298                         if ((struct list_head *) ains == &block->instructions)
 299                                 break;
 300
 301                         /* Ensure that the chain can continue */
 302                         if (ains->type != TAG_ALU_4) break;
 303
 304                         /* If there's already something in the bundle and we
 305                          * have weird scheduler constraints, break now */
 306                         if (ains->precede_break && index) break;
 307
 308                         /* According to the presentation "The ARM
 309                          * Mali-T880 Mobile GPU" from HotChips 27,
 310                          * there are two pipeline stages. Branching
 311                          * position determined experimentally. Lines
 312                          * are executed in parallel:
 313                          *
 314                          * [ VMUL ] [ SADD ]
 315                          * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
 316                          *
 317                          * Verify that there are no ordering dependencies here.
 318                          *
 319                          * TODO: Allow for parallelism!!!
 320                          */
 321
 322                         /* Pick a unit for it if it doesn't force a particular unit */
 323
 324                         int unit = ains->unit;
 325
 326                         if (!unit) {
 327                                 int op = ains->alu.op;
 328                                 int units = alu_opcode_props[op].props;
 329                                 bool scalar = mir_is_scalar(ains);
 330
 331                                 if (!scalar) {
 332                                         if (last_unit >= UNIT_VADD) {
 333                                                 if (units & UNIT_VLUT)
 334                                                         unit = UNIT_VLUT;
 335                                                 else
 336                                                         break;
 337                                         } else {
 338                                                 if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
 339                                                         unit = UNIT_VMUL;
 340                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
 341                                                         unit = UNIT_VADD;
 342                                                 else if (units & UNIT_VLUT)
 343                                                         unit = UNIT_VLUT;
 344                                                 else
 345                                                         break;
 346                                         }
 347                                 } else {
 348                                         if (last_unit >= UNIT_VADD) {
 349                                                 if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
 350                                                         unit = UNIT_SMUL;
 351                                                 else if (units & UNIT_VLUT)
 352                                                         unit = UNIT_VLUT;
 353                                                 else
 354                                                         break;
 355                                         } else {
 356                                                 if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
 357                                                         unit = UNIT_VMUL;
 358                                                 else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
 359                                                         unit = UNIT_SADD;
 360                                                 else if (units & UNIT_VADD)
 361                                                         unit = UNIT_VADD;
 362                                                 else if (units & UNIT_SMUL)
 363                                                         unit = UNIT_SMUL;
 364                                                 else if (units & UNIT_VLUT)
 365                                                         unit = UNIT_VLUT;
 366                                                 else
 367                                                         break;
 368                                         }
 369                                 }
 370
 371                                 assert(unit & units);
 372                         }
 373
 374                         /* Late unit check, this time for encoding (not parallelism) */
 375                         if (unit <= last_unit) break;
 376
 377                         /* Clear the segment */
 378                         if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
 379                                 segment_size = 0;
 380
 381                         if (midgard_has_hazard(segment, segment_size, ains))
 382                                 break;
 383
 384                         /* We're good to go -- emit the instruction */
 385                         ains->unit = unit;
 386
 387                         segment[segment_size++] = ains;
 388
 389                         /* We try to reuse constants if possible, by adjusting
 390                          * the swizzle */
 391
 392                         if (ains->has_blend_constant) {
 393                                 /* Everything conflicts with the blend constant */
 394                                 if (bundle.has_embedded_constants)
 395                                         break;
 396
 397                                 bundle.has_blend_constant = 1;
 398                                 bundle.has_embedded_constants = 1;
 399                         } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
 400                                 /* TODO: DRY with the analysis pass */
 401
 402                                 if (bundle.has_blend_constant)
 403                                         break;
 404
 405                                 if (constant_count)
 406                                         break;
 407
 408                                 /* TODO: Fix packing XXX */
 409                                 uint16_t *bundles = (uint16_t *) bundle.constants;
 410                                 uint32_t *constants = (uint32_t *) ains->constants;
 411
 412                                 /* Copy them wholesale */
 413                                 for (unsigned i = 0; i < 4; ++i)
 414                                         bundles[i] = constants[i];
 415
 416                                 bundle.has_embedded_constants = true;
 417                                 constant_count = 4;
 418                         } else if (ains->has_constants) {
 419                                 /* By definition, blend constants conflict with
 420                                  * everything, so if there are already
 421                                  * constants we break the bundle *now* */
 422
 423                                 if (bundle.has_blend_constant)
 424                                         break;
 425
 426                                 /* For anything but blend constants, we can do
 427                                  * proper analysis, however */
 428
 429                                 /* TODO: Mask by which are used */
 430                                 uint32_t *constants = (uint32_t *) ains->constants;
 431                                 uint32_t *bundles = (uint32_t *) bundle.constants;
 432
 433                                 uint32_t indices[4] = { 0 };
 434                                 bool break_bundle = false;
 435
 436                                 for (unsigned i = 0; i < 4; ++i) {
 437                                         uint32_t cons = constants[i];
 438                                         bool constant_found = false;
 439
 440                                         /* Search for the constant */
 441                                         for (unsigned j = 0; j < constant_count; ++j) {
 442                                                 if (bundles[j] != cons)
 443                                                         continue;
 444
 445                                                 /* We found it, reuse */
 446                                                 indices[i] = j;
 447                                                 constant_found = true;
 448                                                 break;
 449                                         }
 450
 451                                         if (constant_found)
 452                                                 continue;
 453
 454                                         /* We didn't find it, so allocate it */
 455                                         unsigned idx = constant_count++;
 456
 457                                         if (idx >= 4) {
 458                                                 /* Uh-oh, out of space */
 459                                                 break_bundle = true;
 460                                                 break;
 461                                         }
 462
 463                                         /* We have space, copy it in! */
 464                                         bundles[idx] = cons;
 465                                         indices[i] = idx;
 466                                 }
 467
 468                                 if (break_bundle)
 469                                         break;
 470
 471                                 /* Cool, we have it in. So use indices as a
 472                                  * swizzle */
 473
 474                                 unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
 475                                 unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
 476
 477                                 if (ains->src[0] == r_constant)
 478                                         ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
 479
 480                                 if (ains->src[1] == r_constant)
 481                                         ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
 482
 483                                 bundle.has_embedded_constants = true;
 484                         }
 485
 486                         if (ains->compact_branch) {
 487                                 /* All of r0 has to be written out along with
 488                                  * the branch writeout */
 489
 490                                 if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count)) {
 491                                         /* We only work on full moves
 492                                          * at the beginning. We could
 493                                          * probably do better */
 494                                         if (index != 0)
 495                                                 break;
 496
 497                                         /* Inject a move */
 498                                         midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
 499                                         ins.unit = UNIT_VMUL;
 500                                         control |= ins.unit;
 501
 502                                         /* TODO don't leak */
 503                                         midgard_instruction *move =
 504                                                 mem_dup(&ins, sizeof(midgard_instruction));
 505                                         bytes_emitted += bytes_for_instruction(move);
 506                                         bundle.instructions[packed_idx++] = move;
 507                                 }
 508                         }
 509
 510                         bytes_emitted += bytes_for_instruction(ains);
 511
 512                         /* Defer marking until after writing to allow for break */
 513                         scheduled[index] = ains;
 514                         control |= ains->unit;
 515                         last_unit = ains->unit;
 516                         ++instructions_emitted;
 517                         ++index;
 518                 }
 519
 520                 int padding = 0;
 521
 522                 /* Pad ALU op to nearest word */
 523
 524                 if (bytes_emitted & 15) {
 525                         padding = 16 - (bytes_emitted & 15);
 526                         bytes_emitted += padding;
 527                 }
 528
 529                 /* Constants must always be quadwords */
 530                 if (bundle.has_embedded_constants)
 531                         bytes_emitted += 16;
 532
 533                 /* Size ALU instruction for tag */
 534                 bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
 535                 bundle.padding = padding;
 536                 bundle.control = bundle.tag | control;
 537
 538                 break;
 539         }
 540
 541         case TAG_LOAD_STORE_4: {
 542                 /* Load store instructions have two words at once. If
 543                  * we only have one queued up, we need to NOP pad.
 544                  * Otherwise, we store both in succession to save space
 545                  * and cycles -- letting them go in parallel -- skip
 546                  * the next. The usefulness of this optimisation is
 547                  * greatly dependent on the quality of the instruction
 548                  * scheduler.
 549                  */
 550
 551                 midgard_instruction *next_op = mir_next_op(ins);
 552
 553                 if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
 554                         /* TODO: Concurrency check */
 555                         instructions_emitted++;
 556                 }
 557
 558                 break;
 559         }
 560
 561         case TAG_TEXTURE_4: {
 562                 /* Which tag we use depends on the shader stage */
 563                 bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
 564                 bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
 565                 break;
 566         }
 567
 568         default:
 569                 unreachable("Unknown tag");
 570                 break;
 571         }
 572
 573         /* Copy the instructions into the bundle */
 574         bundle.instruction_count = instructions_emitted + 1 + packed_idx;
 575
 576         midgard_instruction *uins = ins;
 577         for (; packed_idx < bundle.instruction_count; ++packed_idx) {
 578                 assert(&uins->link != &block->instructions);
 579                 bundle.instructions[packed_idx] = uins;
 580                 uins = mir_next_op(uins);
 581         }
 582
 583         *skip = instructions_emitted;
 584
 585         return bundle;
 586 }
 587
 588 /* Schedule a single block by iterating its instruction to create bundles.
 589  * While we go, tally about the bundle sizes to compute the block size. */
 590
 591 static void
 592 schedule_block(compiler_context *ctx, midgard_block *block)
 593 {
 594         util_dynarray_init(&block->bundles, NULL);
 595
 596         block->quadword_count = 0;
 597
 598         mir_foreach_instr_in_block(block, ins) {
 599                 int skip;
 600                 midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
 601                 util_dynarray_append(&block->bundles, midgard_bundle, bundle);
 602
 603                 if (bundle.has_blend_constant) {
 604                         /* TODO: Multiblock? */
 605                         int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
 606                         ctx->blend_constant_offset = quadwords_within_block * 0x10;
 607                 }
 608
 609                 while(skip--)
 610                         ins = mir_next_op(ins);
 611
 612                 block->quadword_count += quadword_size(bundle.tag);
 613         }
 614
 615         block->is_scheduled = true;
 616 }
 617
 618 /* The following passes reorder MIR instructions to enable better scheduling */
 619
 620 static void
 621 midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
 622 {
 623         mir_foreach_instr_in_block_safe(block, ins) {
 624                 if (ins->type != TAG_LOAD_STORE_4) continue;
 625
 626                 /* We've found a load/store op. Check if next is also load/store. */
 627                 midgard_instruction *next_op = mir_next_op(ins);
 628                 if (&next_op->link != &block->instructions) {
 629                         if (next_op->type == TAG_LOAD_STORE_4) {
 630                                 /* If so, we're done since we're a pair */
 631                                 ins = mir_next_op(ins);
 632                                 continue;
 633                         }
 634
 635                         /* Maximum search distance to pair, to avoid register pressure disasters */
 636                         int search_distance = 8;
 637
 638                         /* Otherwise, we have an orphaned load/store -- search for another load */
 639                         mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
 640                                 /* Terminate search if necessary */
 641                                 if (!(search_distance--)) break;
 642
 643                                 if (c->type != TAG_LOAD_STORE_4) continue;
 644
 645                                 /* We can only reorder if there are no sources */
 646
 647                                 bool deps = false;
 648
 649                                 for (unsigned s = 0; s < ARRAY_SIZE(ins->src); ++s)
 650                                         deps |= (c->src[s] != ~0);
 651
 652                                 if (deps)
 653                                         continue;
 654
 655                                 /* We found one! Move it up to pair and remove it from the old location */
 656
 657                                 mir_insert_instruction_before(ctx, ins, *c);
 658                                 mir_remove_instruction(c);
 659
 660                                 break;
 661                         }
 662                 }
 663         }
 664 }
 665
 666 /* When we're 'squeezing down' the values in the IR, we maintain a hash
 667  * as such */
 668
 669 static unsigned
 670 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
 671 {
 672         if (hash >= SSA_FIXED_MINIMUM)
 673                 return hash;
 674
 675         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
 676                                 ctx->hash_to_temp, hash + 1);
 677
 678         if (temp)
 679                 return temp - 1;
 680
 681         /* If no temp is find, allocate one */
 682         temp = ctx->temp_count++;
 683         ctx->max_hash = MAX2(ctx->max_hash, hash);
 684
 685         _mesa_hash_table_u64_insert(ctx->hash_to_temp,
 686                                     hash + 1, (void *) ((uintptr_t) temp + 1));
 687
 688         return temp;
 689 }
 690
 691 /* Reassigns numbering to get rid of gaps in the indices */
 692
 693 static void
 694 mir_squeeze_index(compiler_context *ctx)
 695 {
 696         /* Reset */
 697         ctx->temp_count = 0;
 698         /* TODO don't leak old hash_to_temp */
 699         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
 700
 701         mir_foreach_instr_global(ctx, ins) {
 702                 ins->dest = find_or_allocate_temp(ctx, ins->dest);
 703
 704                 for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
 705                         ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
 706         }
 707 }
 708
 709 static midgard_instruction
 710 v_load_store_scratch(
 711                 unsigned srcdest,
 712                 unsigned index,
 713                 bool is_store,
 714                 unsigned mask)
 715 {
 716         /* We index by 32-bit vec4s */
 717         unsigned byte = (index * 4 * 4);
 718
 719         midgard_instruction ins = {
 720                 .type = TAG_LOAD_STORE_4,
 721                 .mask = mask,
 722                 .dest = ~0,
 723                 .src = { ~0, ~0, ~0 },
 724                 .load_store = {
 725                         .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
 726                         .swizzle = SWIZZLE_XYZW,
 727
 728                         /* For register spilling - to thread local storage */
 729                         .arg_1 = 0xEA,
 730                         .arg_2 = 0x1E,
 731
 732                         /* Splattered across, TODO combine logically */
 733                         .varying_parameters = (byte & 0x1FF) << 1,
 734                         .address = (byte >> 9)
 735                 },
 736
 737                 /* If we spill an unspill, RA goes into an infinite loop */
 738                 .no_spill = true
 739         };
 740
 741        if (is_store) {
 742                 /* r0 = r26, r1 = r27 */
 743                 assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
 744                 ins.src[0] = srcdest;
 745         } else {
 746                 ins.dest = srcdest;
 747         }
 748
 749         return ins;
 750 }
 751
 752 /* If register allocation fails, find the best spill node and spill it to fix
 753  * whatever the issue was. This spill node could be a work register (spilling
 754  * to thread local storage), but it could also simply be a special register
 755  * that needs to spill to become a work register. */
 756
 757 static void mir_spill_register(
 758                 compiler_context *ctx,
 759                 struct ra_graph *g,
 760                 unsigned *spill_count)
 761 {
 762         unsigned spill_index = ctx->temp_count;
 763
 764         /* Our first step is to calculate spill cost to figure out the best
 765          * spill node. All nodes are equal in spill cost, but we can't spill
 766          * nodes written to from an unspill */
 767
 768         for (unsigned i = 0; i < ctx->temp_count; ++i) {
 769                 ra_set_node_spill_cost(g, i, 1.0);
 770         }
 771
 772         mir_foreach_instr_global(ctx, ins) {
 773                 if (ins->no_spill &&
 774                     ins->dest >= 0 &&
 775                     ins->dest < ctx->temp_count)
 776                         ra_set_node_spill_cost(g, ins->dest, -1.0);
 777         }
 778
 779         int spill_node = ra_get_best_spill_node(g);
 780
 781         if (spill_node < 0) {
 782                 mir_print_shader(ctx);
 783                 assert(0);
 784         }
 785
 786         /* We have a spill node, so check the class. Work registers
 787          * legitimately spill to TLS, but special registers just spill to work
 788          * registers */
 789
 790         unsigned class = ra_get_node_class(g, spill_node);
 791         bool is_special = (class >> 2) != REG_CLASS_WORK;
 792         bool is_special_w = (class >> 2) == REG_CLASS_TEXW;
 793
 794         /* Allocate TLS slot (maybe) */
 795         unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
 796
 797         /* For TLS, replace all stores to the spilled node. For
 798          * special reads, just keep as-is; the class will be demoted
 799          * implicitly. For special writes, spill to a work register */
 800
 801         if (!is_special || is_special_w) {
 802                 if (is_special_w)
 803                         spill_slot = spill_index++;
 804
 805                 mir_foreach_instr_global_safe(ctx, ins) {
 806                         if (ins->dest != spill_node) continue;
 807
 808                         midgard_instruction st;
 809
 810                         if (is_special_w) {
 811                                 st = v_mov(spill_node, blank_alu_src, spill_slot);
 812                                 st.no_spill = true;
 813                         } else {
 814                                 ins->dest = SSA_FIXED_REGISTER(26);
 815                                 st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
 816                         }
 817
 818                         /* Hint: don't rewrite this node */
 819                         st.hint = true;
 820
 821                         mir_insert_instruction_before(ctx, mir_next_op(ins), st);
 822
 823                         if (!is_special)
 824                                 ctx->spills++;
 825                 }
 826         }
 827
 828         /* For special reads, figure out how many components we need */
 829         unsigned read_mask = 0;
 830
 831         mir_foreach_instr_global_safe(ctx, ins) {
 832                 read_mask |= mir_mask_of_read_components(ins, spill_node);
 833         }
 834
 835         /* Insert a load from TLS before the first consecutive
 836          * use of the node, rewriting to use spilled indices to
 837          * break up the live range. Or, for special, insert a
 838          * move. Ironically the latter *increases* register
 839          * pressure, but the two uses of the spilling mechanism
 840          * are somewhat orthogonal. (special spilling is to use
 841          * work registers to back special registers; TLS
 842          * spilling is to use memory to back work registers) */
 843
 844         mir_foreach_block(ctx, block) {
 845                 bool consecutive_skip = false;
 846                 unsigned consecutive_index = 0;
 847
 848                 mir_foreach_instr_in_block(block, ins) {
 849                         /* We can't rewrite the moves used to spill in the
 850                          * first place. These moves are hinted. */
 851                         if (ins->hint) continue;
 852
 853                         if (!mir_has_arg(ins, spill_node)) {
 854                                 consecutive_skip = false;
 855                                 continue;
 856                         }
 857
 858                         if (consecutive_skip) {
 859                                 /* Rewrite */
 860                                 mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
 861                                 continue;
 862                         }
 863
 864                         if (!is_special_w) {
 865                                 consecutive_index = ++spill_index;
 866
 867                                 midgard_instruction *before = ins;
 868
 869                                 /* For a csel, go back one more not to break up the bundle */
 870                                 if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
 871                                         before = mir_prev_op(before);
 872
 873                                 midgard_instruction st;
 874
 875                                 if (is_special) {
 876                                         /* Move */
 877                                         st = v_mov(spill_node, blank_alu_src, consecutive_index);
 878                                         st.no_spill = true;
 879                                 } else {
 880                                         /* TLS load */
 881                                         st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
 882                                 }
 883
 884                                 /* Mask the load based on the component count
 885                                  * actually needed to prvent RA loops */
 886
 887                                 st.mask = read_mask;
 888
 889                                 mir_insert_instruction_before(ctx, before, st);
 890                                // consecutive_skip = true;
 891                         } else {
 892                                 /* Special writes already have their move spilled in */
 893                                 consecutive_index = spill_slot;
 894                         }
 895
 896
 897                         /* Rewrite to use */
 898                         mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
 899
 900                         if (!is_special)
 901                                 ctx->fills++;
 902                 }
 903         }
 904
 905         /* Reset hints */
 906
 907         mir_foreach_instr_global(ctx, ins) {
 908                 ins->hint = false;
 909         }
 910 }
 911
 912 void
 913 schedule_program(compiler_context *ctx)
 914 {
 915         struct ra_graph *g = NULL;
 916         bool spilled = false;
 917         int iter_count = 1000; /* max iterations */
 918
 919         /* Number of 128-bit slots in memory we've spilled into */
 920         unsigned spill_count = 0;
 921
 922         midgard_promote_uniforms(ctx, 16);
 923
 924         mir_foreach_block(ctx, block) {
 925                 midgard_pair_load_store(ctx, block);
 926         }
 927
 928         /* Must be lowered right before RA */
 929         mir_squeeze_index(ctx);
 930         mir_lower_special_reads(ctx);
 931
 932         /* Lowering can introduce some dead moves */
 933
 934         mir_foreach_block(ctx, block) {
 935                 midgard_opt_dead_move_eliminate(ctx, block);
 936         }
 937
 938         do {
 939                 if (spilled)
 940                         mir_spill_register(ctx, g, &spill_count);
 941
 942                 mir_squeeze_index(ctx);
 943
 944                 g = NULL;
 945                 g = allocate_registers(ctx, &spilled);
 946         } while(spilled && ((iter_count--) > 0));
 947
 948         /* We can simplify a bit after RA */
 949
 950         mir_foreach_block(ctx, block) {
 951                 midgard_opt_post_move_eliminate(ctx, block, g);
 952         }
 953
 954         /* After RA finishes, we schedule all at once */
 955
 956         mir_foreach_block(ctx, block) {
 957                 schedule_block(ctx, block);
 958         }
 959
 960         /* Finally, we create pipeline registers as a peephole pass after
 961          * scheduling. This isn't totally optimal, since there are cases where
 962          * the usage of pipeline registers can eliminate spills, but it does
 963          * save some power */
 964
 965         mir_create_pipeline_registers(ctx);
 966
 967         if (iter_count <= 0) {
 968                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
 969                 assert(0);
 970         }
 971
 972         /* Report spilling information. spill_count is in 128-bit slots (vec4 x
 973          * fp32), but tls_size is in bytes, so multiply by 16 */
 974
 975         ctx->tls_size = spill_count * 16;
 976
 977         install_registers(ctx, g);
 978 }