src/panfrost/midgard/midgard_schedule.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "compiler.h"
  25 #include "midgard_ops.h"
  26 #include "util/u_memory.h"
  27 #include "util/register_allocate.h"
  28
  29 /* Scheduling for Midgard is complicated, to say the least. ALU instructions
  30  * must be grouped into VLIW bundles according to the following model:
  31  *
  32  * [VMUL] [SADD]
  33  * [VADD] [SMUL] [VLUT]
  34  *
  35  * A given instruction can execute on some subset of the units (or a few can
  36  * execute on all). Instructions can be either vector or scalar; only scalar
  37  * instructions can execute on SADD/SMUL units. Units on a given line execute
  38  * in parallel. Subsequent lines execute separately and can pass results
  39  * directly via pipeline registers r24/r25, bypassing the register file.
  40  *
  41  * A bundle can optionally have 128-bits of embedded constants, shared across
  42  * all of the instructions within a bundle.
  43  *
  44  * Instructions consuming conditionals (branches and conditional selects)
  45  * require their condition to be written into the conditional register (r31)
  46  * within the same bundle they are consumed.
  47  *
  48  * Fragment writeout requires its argument to be written in full within the
  49  * same bundle as the branch, with no hanging dependencies.
  50  *
  51  * Load/store instructions are also in bundles of simply two instructions, and
  52  * texture instructions have no bundling.
  53  *
  54  * -------------------------------------------------------------------------
  55  *
  56  */
  57
  58 /* We create the dependency graph with per-component granularity */
  59
  60 #define COMPONENT_COUNT 8
  61
  62 static void
  63 add_dependency(struct util_dynarray *table, unsigned index, unsigned mask, midgard_instruction **instructions, unsigned child)
  64 {
  65         for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
  66                 if (!(mask & (1 << i)))
  67                         continue;
  68
  69                 struct util_dynarray *parents = &table[(COMPONENT_COUNT * index) + i];
  70
  71                 util_dynarray_foreach(parents, unsigned, parent) {
  72                         BITSET_WORD *dependents = instructions[*parent]->dependents;
  73
  74                         /* Already have the dependency */
  75                         if (BITSET_TEST(dependents, child))
  76                                 continue;
  77
  78                         BITSET_SET(dependents, child);
  79                         instructions[child]->nr_dependencies++;
  80                 }
  81         }
  82 }
  83
  84 static void
  85 mark_access(struct util_dynarray *table, unsigned index, unsigned mask, unsigned parent)
  86 {
  87         for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
  88                 if (!(mask & (1 << i)))
  89                         continue;
  90
  91                 util_dynarray_append(&table[(COMPONENT_COUNT * index) + i], unsigned, parent);
  92         }
  93 }
  94
  95 static void
  96 mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count)
  97 {
  98         size_t sz = node_count * COMPONENT_COUNT;
  99
 100         struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz);
 101         struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz);
 102
 103         for (unsigned i = 0; i < sz; ++i) {
 104                 util_dynarray_init(&last_read[i], NULL);
 105                 util_dynarray_init(&last_write[i], NULL);
 106         }
 107
 108         /* Initialize dependency graph */
 109         for (unsigned i = 0; i < count; ++i) {
 110                 instructions[i]->dependents =
 111                         calloc(BITSET_WORDS(count), sizeof(BITSET_WORD));
 112
 113                 instructions[i]->nr_dependencies = 0;
 114         }
 115
 116         /* Populate dependency graph */
 117         for (signed i = count - 1; i >= 0; --i) {
 118                 if (instructions[i]->compact_branch)
 119                         continue;
 120
 121                 unsigned dest = instructions[i]->dest;
 122                 unsigned mask = instructions[i]->mask;
 123
 124                 mir_foreach_src((*instructions), s) {
 125                         unsigned src = instructions[i]->src[s];
 126
 127                         if (src < node_count) {
 128                                 unsigned readmask = mir_mask_of_read_components(instructions[i], src);
 129                                 add_dependency(last_write, src, readmask, instructions, i);
 130                         }
 131                 }
 132
 133                 if (dest < node_count) {
 134                         add_dependency(last_read, dest, mask, instructions, i);
 135                         add_dependency(last_write, dest, mask, instructions, i);
 136                         mark_access(last_write, dest, mask, i);
 137                 }
 138
 139                 mir_foreach_src((*instructions), s) {
 140                         unsigned src = instructions[i]->src[s];
 141
 142                         if (src < node_count) {
 143                                 unsigned readmask = mir_mask_of_read_components(instructions[i], src);
 144                                 mark_access(last_read, src, readmask, i);
 145                         }
 146                 }
 147         }
 148
 149         /* If there is a branch, all instructions depend on it, as interblock
 150          * execution must be purely in-order */
 151
 152         if (instructions[count - 1]->compact_branch) {
 153                 BITSET_WORD *dependents = instructions[count - 1]->dependents;
 154
 155                 for (signed i = count - 2; i >= 0; --i) {
 156                         if (BITSET_TEST(dependents, i))
 157                                 continue;
 158
 159                         BITSET_SET(dependents, i);
 160                         instructions[i]->nr_dependencies++;
 161                 }
 162         }
 163
 164         /* Free the intermediate structures */
 165         for (unsigned i = 0; i < sz; ++i) {
 166                 util_dynarray_fini(&last_read[i]);
 167                 util_dynarray_fini(&last_write[i]);
 168         }
 169 }
 170
 171 /* Create a mask of accessed components from a swizzle to figure out vector
 172  * dependencies */
 173
 174 static unsigned
 175 swizzle_to_access_mask(unsigned swizzle)
 176 {
 177         unsigned component_mask = 0;
 178
 179         for (int i = 0; i < 4; ++i) {
 180                 unsigned c = (swizzle >> (2 * i)) & 3;
 181                 component_mask |= (1 << c);
 182         }
 183
 184         return component_mask;
 185 }
 186
 187 /* Does the mask cover more than a scalar? */
 188
 189 static bool
 190 is_single_component_mask(unsigned mask)
 191 {
 192         int components = 0;
 193
 194         for (int c = 0; c < 8; ++c) {
 195                 if (mask & (1 << c))
 196                         components++;
 197         }
 198
 199         return components == 1;
 200 }
 201
 202 /* Checks for an SSA data hazard between two adjacent instructions, keeping in
 203  * mind that we are a vector architecture and we can write to different
 204  * components simultaneously */
 205
 206 static bool
 207 can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
 208 {
 209         /* Writeout has its own rules anyway */
 210         if (first->compact_branch || second->compact_branch)
 211                 return true;
 212
 213         /* Each instruction reads some registers and writes to a register. See
 214          * where the first writes */
 215
 216         int source = first->dest;
 217         int source_mask = first->mask;
 218
 219         /* As long as the second doesn't read from the first, we're okay */
 220         for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
 221                 if (second->src[i] != source)
 222                         continue;
 223
 224                 if (first->type != TAG_ALU_4)
 225                         return false;
 226
 227                 /* Figure out which components we just read from */
 228
 229                 int q = (i == 0) ? second->alu.src1 : second->alu.src2;
 230                 midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
 231
 232                 /* Check if there are components in common, and fail if so */
 233                 if (swizzle_to_access_mask(m->swizzle) & source_mask)
 234                         return false;
 235         }
 236
 237         /* Otherwise, it's safe in that regard. Another data hazard is both
 238          * writing to the same place, of course */
 239
 240         if (second->dest == source) {
 241                 /* ...but only if the components overlap */
 242
 243                 if (second->mask & source_mask)
 244                         return false;
 245         }
 246
 247         /* ...That's it */
 248         return true;
 249 }
 250
 251 static bool
 252 midgard_has_hazard(
 253         midgard_instruction **segment, unsigned segment_size,
 254         midgard_instruction *ains)
 255 {
 256         for (int s = 0; s < segment_size; ++s)
 257                 if (!can_run_concurrent_ssa(segment[s], ains))
 258                         return true;
 259
 260         return false;
 261
 262
 263 }
 264
 265 /* Fragment writeout (of r0) is allowed when:
 266  *
 267  *  - All components of r0 are written in the bundle
 268  *  - No components of r0 are written in VLUT
 269  *  - Non-pipelined dependencies of r0 are not written in the bundle
 270  *
 271  * This function checks if these requirements are satisfied given the content
 272  * of a scheduled bundle.
 273  */
 274
 275 static bool
 276 can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count, unsigned r0)
 277 {
 278         /* First scan for which components of r0 are written out. Initially
 279          * none are written */
 280
 281         uint8_t r0_written_mask = 0x0;
 282
 283         /* Simultaneously we scan for the set of dependencies */
 284
 285         size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
 286         BITSET_WORD *dependencies = calloc(1, sz);
 287         memset(dependencies, 0, sz);
 288
 289         bool success = false;
 290
 291         for (unsigned i = 0; i < count; ++i) {
 292                 midgard_instruction *ins = bundle[i];
 293
 294                 if (ins->dest != r0)
 295                         continue;
 296
 297                 /* Record written out mask */
 298                 r0_written_mask |= ins->mask;
 299
 300                 /* Record dependencies, but only if they won't become pipeline
 301                  * registers. We know we can't be live after this, because
 302                  * we're writeout at the very end of the shader. So check if
 303                  * they were written before us. */
 304
 305                 unsigned src0 = ins->src[0];
 306                 unsigned src1 = ins->src[1];
 307
 308                 if (!mir_is_written_before(ctx, bundle[0], src0))
 309                         src0 = ~0;
 310
 311                 if (!mir_is_written_before(ctx, bundle[0], src1))
 312                         src1 = ~0;
 313
 314                 if (src0 < node_count)
 315                         BITSET_SET(dependencies, src0);
 316
 317                 if (src1 < node_count)
 318                         BITSET_SET(dependencies, src1);
 319
 320                 /* Requirement 2 */
 321                 if (ins->unit == UNIT_VLUT)
 322                         goto done;
 323         }
 324
 325         /* Requirement 1 */
 326         if ((r0_written_mask & 0xF) != 0xF)
 327                 goto done;
 328
 329         /* Requirement 3 */
 330
 331         for (unsigned i = 0; i < count; ++i) {
 332                 unsigned dest = bundle[i]->dest;
 333
 334                 if (dest < node_count && BITSET_TEST(dependencies, dest))
 335                         goto done;
 336         }
 337
 338         /* Otherwise, we're good to go */
 339         success = true;
 340
 341 done:
 342         free(dependencies);
 343         return success;
 344 }
 345
 346 /* Helpers for scheduling */
 347
 348 static bool
 349 mir_is_scalar(midgard_instruction *ains)
 350 {
 351         /* Does the op support scalar units? */
 352         if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR))
 353                 return false;
 354
 355         /* Do we try to use it as a vector op? */
 356         if (!is_single_component_mask(ains->mask))
 357                 return false;
 358
 359         /* Otherwise, check mode hazards */
 360         bool could_scalar = true;
 361
 362         /* Only 16/32-bit can run on a scalar unit */
 363         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
 364         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
 365         could_scalar &= ains->alu.dest_override == midgard_dest_override_none;
 366
 367         if (ains->alu.reg_mode == midgard_reg_mode_16) {
 368                 /* If we're running in 16-bit mode, we
 369                  * can't have any 8-bit sources on the
 370                  * scalar unit (since the scalar unit
 371                  * doesn't understand 8-bit) */
 372
 373                 midgard_vector_alu_src s1 =
 374                         vector_alu_from_unsigned(ains->alu.src1);
 375
 376                 could_scalar &= !s1.half;
 377
 378                 midgard_vector_alu_src s2 =
 379                         vector_alu_from_unsigned(ains->alu.src2);
 380
 381                 could_scalar &= !s2.half;
 382         }
 383
 384         return could_scalar;
 385 }
 386
 387 /* How many bytes does this ALU instruction add to the bundle? */
 388
 389 static unsigned
 390 bytes_for_instruction(midgard_instruction *ains)
 391 {
 392         if (ains->unit & UNITS_ANY_VECTOR)
 393                 return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
 394         else if (ains->unit == ALU_ENAB_BRANCH)
 395                 return sizeof(midgard_branch_extended);
 396         else if (ains->compact_branch)
 397                 return sizeof(ains->br_compact);
 398         else
 399                 return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
 400 }
 401
  402 /* Builds, but does not emit, a single bundle beginning at instruction `ins`
       * of `block`. What goes into the bundle depends on the tag of `ins`:
       *
       *  - TAG_ALU_4: `ins` and the ALU instructions following it in the block
       *    are added greedily until the list ends, a non-ALU instruction is
       *    reached, or a unit/hazard/constant constraint would be violated. The
       *    bundle's tag, padding and control word are derived from the number of
       *    bytes the chosen instructions need.
       *  - TAG_LOAD_STORE_4: `ins` is paired with the next instruction if that is
       *    also a load/store.
       *  - TAG_TEXTURE_4: `ins` alone; the tag depends on the shader stage.
       *
       * ctx:   compiler context (stage, temp_count; passed to writeout checks)
       * block: block whose instruction list contains `ins`
       * ins:   first instruction to place in the bundle
       * skip:  out-parameter; receives the number of instructions consumed in
       *        addition to `ins`
       *
       * Returns the populated bundle by value.
       */
  405
  406 static midgard_bundle
  407 schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
  408 {
              /* instructions_emitted counts instructions taken from the block
               * beyond `ins`; packed_idx counts instructions injected at the
               * front of the bundle (see the writeout move below) */
  409         int instructions_emitted = 0, packed_idx = 0;
  410         midgard_bundle bundle = { 0 };
  411
              /* ALU instructions accepted so far, in order; handed to
               * can_writeout_fragment when a writeout branch is reached */
  412         midgard_instruction *scheduled[5] = { NULL };
  413
  414         uint8_t tag = ins->type;
  415
  416         /* Default to the instruction's tag */
  417         bundle.tag = tag;
  418
  419         switch (ins->type) {
  420         case TAG_ALU_4: {
                      /* Accumulates the unit bits of every instruction placed
                       * in the bundle */
  421                 uint32_t control = 0;
  422                 size_t bytes_emitted = sizeof(control);
  423
  424                 /* TODO: Constant combining */
  425                 int index = 0, last_unit = 0;
  426
  427                 /* Previous instructions, for the purpose of parallelism */
  428                 midgard_instruction *segment[4] = {0};
  429                 int segment_size = 0;
  430
                      /* Start at -1: every accepted instruction (including `ins`
                       * itself) increments this, so it ends up as the number of
                       * instructions accepted beyond `ins` */
  431                 instructions_emitted = -1;
  432                 midgard_instruction *pins = ins;
  433
                      /* Number of 32-bit slots of bundle.constants in use */
  434                 unsigned constant_count = 0;
  435
  436                 for (;;) {
  437                         midgard_instruction *ains = pins;
  438
  439                         /* Advance instruction pointer */
  440                         if (index) {
  441                                 ains = mir_next_op(pins);
  442                                 pins = ains;
  443                         }
  444
  445                         /* Out-of-work condition */
  446                         if ((struct list_head *) ains == &block->instructions)
  447                                 break;
  448
  449                         /* Ensure that the chain can continue */
  450                         if (ains->type != TAG_ALU_4) break;
  451
  452                         /* If there's already something in the bundle and we
  453                          * have weird scheduler constraints, break now */
  454                         if (ains->precede_break && index) break;
  455
  456                         /* According to the presentation "The ARM
  457                          * Mali-T880 Mobile GPU" from HotChips 27,
  458                          * there are two pipeline stages. Branching
  459                          * position determined experimentally. Lines
  460                          * are executed in parallel:
  461                          *
  462                          * [ VMUL ] [ SADD ]
  463                          * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
  464                          *
  465                          * Verify that there are no ordering dependencies here.
  466                          *
  467                          * TODO: Allow for parallelism!!!
  468                          */
  469
  470                         /* Pick a unit for it if it doesn't force a particular unit */
  471
  472                         int unit = ains->unit;
  473
  474                         if (!unit) {
  475                                 int op = ains->alu.op;
  476                                 int units = alu_opcode_props[op].props;
  477                                 bool scalar = mir_is_scalar(ains);
  478
                                      /* last_unit >= UNIT_VADD means the bundle has
                                       * already reached the second line of the
                                       * diagram above, so only units from that line
                                       * are candidates */
  479                                 if (!scalar) {
  480                                         if (last_unit >= UNIT_VADD) {
  481                                                 if (units & UNIT_VLUT)
  482                                                         unit = UNIT_VLUT;
  483                                                 else
  484                                                         break;
  485                                         } else {
  486                                                 if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
  487                                                         unit = UNIT_VMUL;
  488                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
  489                                                         unit = UNIT_VADD;
  490                                                 else if (units & UNIT_VLUT)
  491                                                         unit = UNIT_VLUT;
  492                                                 else
  493                                                         break;
  494                                         }
  495                                 } else {
  496                                         if (last_unit >= UNIT_VADD) {
  497                                                 if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
  498                                                         unit = UNIT_SMUL;
  499                                                 else if (units & UNIT_VLUT)
  500                                                         unit = UNIT_VLUT;
  501                                                 else
  502                                                         break;
  503                                         } else {
  504                                                 if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
  505                                                         unit = UNIT_VMUL;
  506                                                 else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
  507                                                         unit = UNIT_SADD;
  508                                                 else if (units & UNIT_VADD)
  509                                                         unit = UNIT_VADD;
  510                                                 else if (units & UNIT_SMUL)
  511                                                         unit = UNIT_SMUL;
  512                                                 else if (units & UNIT_VLUT)
  513                                                         unit = UNIT_VLUT;
  514                                                 else
  515                                                         break;
  516                                         }
  517                                 }
  518
  519                                 assert(unit & units);
  520                         }
  521
  522                         /* Late unit check, this time for encoding (not parallelism) */
  523                         if (unit <= last_unit) break;
  524
  525                         /* Clear the segment */
  526                         if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
  527                                 segment_size = 0;
  528
  529                         if (midgard_has_hazard(segment, segment_size, ains))
  530                                 break;
  531
  532                         /* We're good to go -- emit the instruction */
                              /* NOTE(review): the unit is stored on the instruction
                               * here, before the constant and writeout checks below
                               * that can still break out of the loop -- confirm that
                               * leaving it set on a rejected instruction is intended */
  533                         ains->unit = unit;
  534
  535                         segment[segment_size++] = ains;
  536
  537                         /* We try to reuse constants if possible, by adjusting
  538                          * the swizzle */
  539
  540                         if (ains->has_blend_constant) {
  541                                 /* Everything conflicts with the blend constant */
  542                                 if (bundle.has_embedded_constants)
  543                                         break;
  544
  545                                 bundle.has_blend_constant = 1;
  546                                 bundle.has_embedded_constants = 1;
  547                         } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
  548                                 /* TODO: DRY with the analysis pass */
  549
  550                                 if (bundle.has_blend_constant)
  551                                         break;
  552
  553                                 if (constant_count)
  554                                         break;
  555
  556                                 /* TODO: Fix packing XXX */
  557                                 uint16_t *bundles = (uint16_t *) bundle.constants;
  558                                 uint32_t *constants = (uint32_t *) ains->constants;
  559
  560                                 /* Copy them wholesale */
                                      /* NOTE(review): each 32-bit source value is
                                       * narrowed to 16 bits by this assignment */
  561                                 for (unsigned i = 0; i < 4; ++i)
  562                                         bundles[i] = constants[i];
  563
  564                                 bundle.has_embedded_constants = true;
  565                                 constant_count = 4;
  566                         } else if (ains->has_constants) {
  567                                 /* By definition, blend constants conflict with
  568                                  * everything, so if there are already
  569                                  * constants we break the bundle *now* */
  570
  571                                 if (bundle.has_blend_constant)
  572                                         break;
  573
  574                                 /* For anything but blend constants, we can do
  575                                  * proper analysis, however */
  576
  577                                 /* TODO: Mask by which are used */
  578                                 uint32_t *constants = (uint32_t *) ains->constants;
  579                                 uint32_t *bundles = (uint32_t *) bundle.constants;
  580
                                      /* indices[i] is the bundle slot that ends up
                                       * holding the instruction's i-th constant */
  581                                 uint32_t indices[4] = { 0 };
  582                                 bool break_bundle = false;
  583
  584                                 for (unsigned i = 0; i < 4; ++i) {
  585                                         uint32_t cons = constants[i];
  586                                         bool constant_found = false;
  587
  588                                         /* Search for the constant */
  589                                         for (unsigned j = 0; j < constant_count; ++j) {
  590                                                 if (bundles[j] != cons)
  591                                                         continue;
  592
  593                                                 /* We found it, reuse */
  594                                                 indices[i] = j;
  595                                                 constant_found = true;
  596                                                 break;
  597                                         }
  598
  599                                         if (constant_found)
  600                                                 continue;
  601
  602                                         /* We didn't find it, so allocate it */
  603                                         unsigned idx = constant_count++;
  604
  605                                         if (idx >= 4) {
  606                                                 /* Uh-oh, out of space */
  607                                                 break_bundle = true;
  608                                                 break;
  609                                         }
  610
  611                                         /* We have space, copy it in! */
  612                                         bundles[idx] = cons;
  613                                         indices[i] = idx;
  614                                 }
  615
  616                                 if (break_bundle)
  617                                         break;
  618
  619                                 /* Cool, we have it in. So use indices as a
  620                                  * swizzle */
  621
  622                                 unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
  623                                 unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
  624
  625                                 if (ains->src[0] == r_constant)
  626                                         ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
  627
  628                                 if (ains->src[1] == r_constant)
  629                                         ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
  630
  631                                 bundle.has_embedded_constants = true;
  632                         }
  633
  634                         if (ains->compact_branch) {
  635                                 /* All of r0 has to be written out along with
  636                                  * the branch writeout */
  637
  638                                 if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count, ains->src[0])) {
  639                                         /* We only work on full moves
  640                                          * at the beginning. We could
  641                                          * probably do better */
  642                                         if (index != 0)
  643                                                 break;
  644
  645                                         /* Inject a move */
                                              /* Note: this local `ins` shadows the
                                               * function parameter of the same name
                                               * for the rest of this scope */
  646                                         midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
  647                                         ins.unit = UNIT_VMUL;
  648                                         control |= ins.unit;
  649
  650                                         /* TODO don't leak */
  651                                         midgard_instruction *move =
  652                                                 mem_dup(&ins, sizeof(midgard_instruction));
  653                                         bytes_emitted += bytes_for_instruction(move);
                                              /* Injected instructions occupy the
                                               * front of the bundle; the block's own
                                               * instructions are appended after them
                                               * at the end of this function */
  654                                         bundle.instructions[packed_idx++] = move;
  655                                 }
  656                         }
  657
  658                         bytes_emitted += bytes_for_instruction(ains);
  659
  660                         /* Defer marking until after writing to allow for break */
  661                         scheduled[index] = ains;
  662                         control |= ains->unit;
  663                         last_unit = ains->unit;
  664                         ++instructions_emitted;
  665                         ++index;
  666                 }
  667
  668                 int padding = 0;
  669
  670                 /* Pad ALU op to nearest word */
  671
  672                 if (bytes_emitted & 15) {
  673                         padding = 16 - (bytes_emitted & 15);
  674                         bytes_emitted += padding;
  675                 }
  676
  677                 /* Constants must always be quadwords */
  678                 if (bundle.has_embedded_constants)
  679                         bytes_emitted += 16;
  680
  681                 /* Size ALU instruction for tag */
                      /* bytes_emitted is a multiple of 16 here, so the tag is
                       * offset from TAG_ALU_4 by the number of 16-byte words
                       * beyond the first */
  682                 bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
  683                 bundle.padding = padding;
  684                 bundle.control = bundle.tag | control;
  685
  686                 break;
  687         }
  688
  689         case TAG_LOAD_STORE_4: {
  690                 /* Load store instructions have two words at once. If
  691                  * we only have one queued up, we need to NOP pad.
  692                  * Otherwise, we store both in succession to save space
  693                  * and cycles -- letting them go in parallel -- skip
  694                  * the next. The usefulness of this optimisation is
  695                  * greatly dependent on the quality of the instruction
  696                  * scheduler.
  697                  */
  698
  699                 midgard_instruction *next_op = mir_next_op(ins);
  700
  701                 if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
  702                         /* TODO: Concurrency check */
  703                         instructions_emitted++;
  704                 }
  705
  706                 break;
  707         }
  708
  709         case TAG_TEXTURE_4: {
  710                 /* Which tag we use depends on the shader stage */
  711                 bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
  712                 bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
  713                 break;
  714         }
  715
  716         default:
  717                 unreachable("Unknown tag");
  718                 break;
  719         }
  720
  721         /* Copy the instructions into the bundle */
              /* The +1 accounts for `ins` itself; packed_idx adds the injected
               * instructions already stored at the front */
  722         bundle.instruction_count = instructions_emitted + 1 + packed_idx;
  723
              /* Fill the remaining slots, after any injected instructions, by
               * walking the block's list starting at `ins` */
  724         midgard_instruction *uins = ins;
  725         for (; packed_idx < bundle.instruction_count; ++packed_idx) {
  726                 assert(&uins->link != &block->instructions);
  727                 bundle.instructions[packed_idx] = uins;
  728                 uins = mir_next_op(uins);
  729         }
  730
  731         *skip = instructions_emitted;
  732
  733         return bundle;
  734 }
 735
 736 /* We would like to flatten the linked list of midgard_instructions in a bundle
 737  * to an array of pointers on the heap for easy indexing */
 738
 739 static midgard_instruction **
 740 flatten_mir(midgard_block *block, unsigned *len)
 741 {
 742         *len = list_length(&block->instructions);
 743
 744         if (!(*len))
 745                 return NULL;
 746
 747         midgard_instruction **instructions =
 748                 calloc(sizeof(midgard_instruction *), *len);
 749
 750         unsigned i = 0;
 751
 752         mir_foreach_instr_in_block(block, ins)
 753                 instructions[i++] = ins;
 754
 755         return instructions;
 756 }
 757
 758 /* The worklist is the set of instructions that can be scheduled now; that is,
 759  * the set of instructions with no remaining dependencies */
 760
 761 static void
 762 mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count)
 763 {
 764         for (unsigned i = 0; i < count; ++i) {
 765                 if (instructions[i]->nr_dependencies == 0)
 766                         BITSET_SET(worklist, i);
 767         }
 768 }
 769
 770 /* Update the worklist after an instruction terminates. Remove its edges from
 771  * the graph and if that causes any node to have no dependencies, add it to the
 772  * worklist */
 773
 774 static void
 775 mir_update_worklist(
 776                 BITSET_WORD *worklist, unsigned count,
 777                 midgard_instruction **instructions, midgard_instruction *done)
 778 {
 779         /* Sanity check: if no instruction terminated, there is nothing to do.
 780          * If the instruction that terminated had dependencies, that makes no
 781          * sense and means we messed up the worklist. Finally, as the purpose
 782          * of this routine is to update dependents, we abort early if there are
 783          * no dependents defined. */
 784
 785         if (!done)
 786                 return;
 787
 788         assert(done->nr_dependencies == 0);
 789
 790         if (!done->dependents)
 791                 return;
 792
 793         /* We have an instruction with dependents. Iterate each dependent to
 794          * remove one dependency (`done`), adding dependents to the worklist
 795          * where possible. */
 796
 797         unsigned i;
 798         BITSET_WORD tmp;
 799         BITSET_FOREACH_SET(i, tmp, done->dependents, count) {
 800                 assert(instructions[i]->nr_dependencies);
 801
 802                 if (!(--instructions[i]->nr_dependencies))
 803                         BITSET_SET(worklist, i);
 804         }
 805
 806         free(done->dependents);
 807 }
 808
 809 /* While scheduling, we need to choose instructions satisfying certain
 810  * criteria. As we schedule backwards, we choose the *last* instruction in the
 811  * worklist to simulate in-order scheduling. Chosen instructions must satisfy a
 812  * given predicate. */
 813
 814 struct midgard_predicate {
 815         /* TAG or ~0 for dont-care */
 816         unsigned tag;
 817
 818         /* True if we want to pop off the chosen instruction */
 819         bool destructive;
 820
 821         /* State for bundle constants. constants is the actual constants
 822          * for the bundle. constant_count is the number of bytes (up to
 823          * 16) currently in use for constants. When picking in destructive
 824          * mode, the constants array will be updated, and the instruction
 825          * will be adjusted to index into the constants array */
 826
 827         uint8_t *constants;
 828         unsigned constant_count;
 829         bool blend_constant;
 830
 831         /* Exclude this destination (if not ~0) */
 832         unsigned exclude;
 833 };
 834
 835 /* For an instruction that can fit, adjust it to fit and update the constants
 836  * array, in destructive mode. Returns whether the fitting was successful. */
 837
 838 static bool
 839 mir_adjust_constants(midgard_instruction *ins,
 840                 struct midgard_predicate *pred,
 841                 bool destructive)
 842 {
 843         /* Blend constants dominate */
 844         if (ins->has_blend_constant) {
 845                 if (pred->constant_count)
 846                         return false;
 847                 else if (destructive) {
 848                         pred->blend_constant = true;
 849                         pred->constant_count = 16;
 850                         return true;
 851                 }
 852         }
 853
 854         /* No constant, nothing to adjust */
 855         if (!ins->has_constants)
 856                 return true;
 857
 858         /* TODO: Deduplicate; permit multiple constants within a bundle */
 859
 860         if (destructive && !pred->constant_count) {
 861                 if (ins->alu.reg_mode == midgard_reg_mode_16) {
 862                       /* TODO: Fix packing XXX */
 863                         uint16_t *bundles = (uint16_t *) pred->constants;
 864                         uint32_t *constants = (uint32_t *) ins->constants;
 865
 866                         /* Copy them wholesale */
 867                         for (unsigned i = 0; i < 4; ++i)
 868                                 bundles[i] = constants[i];
 869                 } else {
 870                         memcpy(pred->constants, ins->constants, 16);
 871                 }
 872
 873                 pred->constant_count = 16;
 874                 return true;
 875         }
 876
 877         return !pred->constant_count;
 878 }
 879
 880 static midgard_instruction *
 881 mir_choose_instruction(
 882                 midgard_instruction **instructions,
 883                 BITSET_WORD *worklist, unsigned count,
 884                 struct midgard_predicate *predicate)
 885 {
 886         /* Parse the predicate */
 887         unsigned tag = predicate->tag;
 888         bool alu = tag == TAG_ALU_4;
 889         unsigned unit = predicate->unit;
 890         bool branch = alu && (unit == ALU_ENAB_BR_COMPACT);
 891
 892         /* Iterate to find the best instruction satisfying the predicate */
 893         unsigned i;
 894         BITSET_WORD tmp;
 895
 896         signed best_index = -1;
 897
 898         BITSET_FOREACH_SET(i, tmp, worklist, count) {
 899                 if (tag != ~0 && instructions[i]->type != tag)
 900                         continue;
 901
 902                 if (predicate->exclude != ~0 && instructions[i]->dest == predicate->exclude)
 903                         continue;
 904
 905                 if (alu && !branch && !(alu_opcode_props[instructions[i]->alu.op].props & unit))
 906                         continue;
 907
 908                 if (branch && !instructions[i]->compact_branch)
 909                         continue;
 910
 911                 /* Simulate in-order scheduling */
 912                 if ((signed) i < best_index)
 913                         continue;
 914
 915                 best_index = i;
 916         }
 917
 918
 919         /* Did we find anything?  */
 920
 921         if (best_index < 0)
 922                 return NULL;
 923
 924         /* If we found something, remove it from the worklist */
 925         assert(best_index < count);
 926
 927         if (predicate->destructive) {
 928                 BITSET_CLEAR(worklist, best_index);
 929         }
 930
 931         return instructions[best_index];
 932 }
 933
 934 /* Still, we don't choose instructions in a vacuum. We need a way to choose the
 935  * best bundle type (ALU, load/store, texture). Nondestructive. */
 936
 937 static unsigned
 938 mir_choose_bundle(
 939                 midgard_instruction **instructions,
 940                 BITSET_WORD *worklist, unsigned count)
 941 {
 942         /* At the moment, our algorithm is very simple - use the bundle of the
 943          * best instruction, regardless of what else could be scheduled
 944          * alongside it. This is not optimal but it works okay for in-order */
 945
 946         struct midgard_predicate predicate = {
 947                 .tag = ~0,
 948                 .destructive = false,
 949                 .exclude = ~0
 950         };
 951
 952         midgard_instruction *chosen = mir_choose_instruction(instructions, worklist, count, &predicate);
 953
 954         if (chosen)
 955                 return chosen->type;
 956         else
 957                 return ~0;
 958 }
 959
 960 /* When we are scheduling a branch/csel, we need the consumed condition in the
 961  * same block as a pipeline register. There are two options to enable this:
 962  *
 963  *  - Move the conditional into the bundle. Preferred, but only works if the
 964  *    conditional is used only once and is from this block.
 965  *  - Copy the conditional.
 966  *
 967  * We search for the conditional. If it's in this block, single-use, and
 968  * without embedded constants, we schedule it immediately. Otherwise, we
 969  * schedule a move for it.
 970  *
 971  * mir_comparison_mobile is a helper to find the moveable condition.
 972  */
 973
 974 static unsigned
 975 mir_comparison_mobile(
 976                 compiler_context *ctx,
 977                 midgard_instruction **instructions,
 978                 unsigned count,
 979                 unsigned cond)
 980 {
 981         if (!mir_single_use(ctx, cond))
 982                 return ~0;
 983
 984         unsigned ret = ~0;
 985
 986         for (unsigned i = 0; i < count; ++i) {
 987                 if (instructions[i]->dest != cond)
 988                         continue;
 989
 990                 /* Must fit in an ALU bundle */
 991                 if (instructions[i]->type != TAG_ALU_4)
 992                         return ~0;
 993
 994                 /* We'll need to rewrite to .w but that doesn't work for vector
 995                  * ops that don't replicate (ball/bany), so bail there */
 996
 997                 if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->alu.op].props))
 998                         return ~0;
 999
1000                 /* TODO: moving conditionals with constants */
1001
1002                 if (instructions[i]->has_constants)
1003                         return ~0;
1004
1005                 /* Ensure it is written only once */
1006
1007                 if (ret != ~0)
1008                         return ~0;
1009                 else
1010                         ret = i;
1011         }
1012
1013         return ret;
1014 }
1015
1016 /* Using the information about the moveable conditional itself, we either pop
1017  * that condition off the worklist for use now, or create a move to
1018  * artificially schedule instead as a fallback */
1019
1020 static midgard_instruction *
1021 mir_schedule_comparison(
1022                 compiler_context *ctx,
1023                 midgard_instruction **instructions,
1024                 BITSET_WORD *worklist, unsigned count,
1025                 unsigned cond, bool vector, unsigned swizzle,
1026                 midgard_instruction *user)
1027 {
1028         /* TODO: swizzle when scheduling */
1029         unsigned comp_i =
1030                 (!vector && (swizzle == 0)) ?
1031                 mir_comparison_mobile(ctx, instructions, count, cond) : ~0;
1032
1033         /* If we can, schedule the condition immediately */
1034         if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) {
1035                 assert(comp_i < count);
1036                 BITSET_CLEAR(worklist, comp_i);
1037                 return instructions[comp_i];
1038         }
1039
1040         /* Otherwise, we insert a move */
1041         midgard_vector_alu_src csel = {
1042                 .swizzle = swizzle
1043         };
1044
1045         midgard_instruction mov = v_mov(cond, csel, cond);
1046         mov.mask = vector ? 0xF : 0x1;
1047
1048         return mir_insert_instruction_before(ctx, user, mov);
1049 }
1050
1051 /* Most generally, we need instructions writing to r31 in the appropriate
1052  * components */
1053
1054 static midgard_instruction *
1055 mir_schedule_condition(compiler_context *ctx,
1056                 struct midgard_predicate *predicate,
1057                 BITSET_WORD *worklist, unsigned count,
1058                 midgard_instruction **instructions,
1059                 midgard_instruction *last)
1060 {
1061         /* For a branch, the condition is the only argument; for csel, third */
1062         bool branch = last->compact_branch;
1063         unsigned condition_index = branch ? 0 : 2;
1064
1065         /* csel_v is vector; otherwise, conditions are scalar */
1066         bool vector = !branch && OP_IS_CSEL_V(last->alu.op);
1067
1068         /* Grab the conditional instruction */
1069
1070         midgard_instruction *cond = mir_schedule_comparison(
1071                         ctx, instructions, worklist, count, last->src[condition_index],
1072                         vector, last->cond_swizzle, last);
1073
1074         /* We have exclusive reign over this (possibly move) conditional
1075          * instruction. We can rewrite into a pipeline conditional register */
1076
1077         predicate->exclude = cond->dest;
1078         cond->dest = SSA_FIXED_REGISTER(31);
1079
1080         if (!vector) {
1081                 cond->mask = (1 << COMPONENT_W);
1082
1083                 mir_foreach_src(cond, s) {
1084                         if (cond->src[s] == ~0)
1085                                 continue;
1086
1087                         mir_set_swizzle(cond, s, (mir_get_swizzle(cond, s) << (2*3)) & 0xFF);
1088                 }
1089         }
1090
1091         /* Schedule the unit: csel is always in the latter pipeline, so a csel
1092          * condition must be in the former pipeline stage (vmul/sadd),
1093          * depending on scalar/vector of the instruction itself. A branch must
1094          * be written from the latter pipeline stage and a branch condition is
1095          * always scalar, so it is always in smul (exception: ball/bany, which
1096          * will be vadd) */
1097
1098         if (branch)
1099                 cond->unit = UNIT_SMUL;
1100         else
1101                 cond->unit = vector ? UNIT_VMUL : UNIT_SADD;
1102
1103         return cond;
1104 }
1105
1106 /* Schedules a single bundle of the given type */
1107
1108 static midgard_bundle
1109 mir_schedule_texture(
1110                 midgard_instruction **instructions,
1111                 BITSET_WORD *worklist, unsigned len)
1112 {
1113         struct midgard_predicate predicate = {
1114                 .tag = TAG_TEXTURE_4,
1115                 .destructive = true,
1116                 .exclude = ~0
1117         };
1118
1119         midgard_instruction *ins =
1120                 mir_choose_instruction(instructions, worklist, len, &predicate);
1121
1122         mir_update_worklist(worklist, len, instructions, ins);
1123
1124         struct midgard_bundle out = {
1125                 .tag = TAG_TEXTURE_4,
1126                 .instruction_count = 1,
1127                 .instructions = { ins }
1128         };
1129
1130         return out;
1131 }
1132
1133 static midgard_bundle
1134 mir_schedule_ldst(
1135                 midgard_instruction **instructions,
1136                 BITSET_WORD *worklist, unsigned len)
1137 {
1138         struct midgard_predicate predicate = {
1139                 .tag = TAG_LOAD_STORE_4,
1140                 .destructive = true,
1141                 .exclude = ~0
1142         };
1143
1144         midgard_instruction *ins =
1145                 mir_choose_instruction(instructions, worklist, len, &predicate);
1146
1147         mir_update_worklist(worklist, len, instructions, ins);
1148
1149         struct midgard_bundle out = {
1150                 .tag = TAG_LOAD_STORE_4,
1151                 .instruction_count = 1,
1152                 .instructions = { ins }
1153         };
1154
1155         return out;
1156 }
1157
1158 static midgard_bundle
1159 mir_schedule_alu(
1160                 compiler_context *ctx,
1161                 midgard_instruction **instructions,
1162                 BITSET_WORD *worklist, unsigned len)
1163 {
1164         struct midgard_bundle bundle = {};
1165
1166         unsigned bytes_emitted = sizeof(bundle.control);
1167
1168         struct midgard_predicate predicate = {
1169                 .tag = TAG_ALU_4,
1170                 .destructive = true,
1171                 .exclude = ~0
1172         };
1173
1174         midgard_instruction *ins =
1175                 mir_choose_instruction(instructions, worklist, len, &predicate);
1176
1177         midgard_instruction *vmul = NULL;
1178         midgard_instruction *vadd = NULL;
1179         midgard_instruction *vlut = NULL;
1180         midgard_instruction *smul = NULL;
1181         midgard_instruction *sadd = NULL;
1182         midgard_instruction *branch = NULL;
1183
1184         mir_update_worklist(worklist, len, instructions, ins);
1185
1186         if (ins->compact_branch) {
1187                 branch = ins;
1188         } else if (!ins->unit) {
1189                 unsigned units = alu_opcode_props[ins->alu.op].props;
1190
1191                 if (units & UNIT_VMUL) {
1192                         ins->unit = UNIT_VMUL;
1193                         vmul = ins;
1194                 } else if (units & UNIT_VADD) {
1195                         ins->unit = UNIT_VADD;
1196                         vadd = ins;
1197                 } else if (units & UNIT_VLUT) {
1198                         ins->unit = UNIT_VLUT;
1199                         vlut = ins;
1200                 } else
1201                         assert(0);
1202         }
1203
1204         bundle.has_embedded_constants = ins->has_constants;
1205         bundle.has_blend_constant = ins->has_blend_constant;
1206
1207         if (ins->alu.reg_mode == midgard_reg_mode_16) {
1208               /* TODO: Fix packing XXX */
1209                 uint16_t *bundles = (uint16_t *) bundle.constants;
1210                 uint32_t *constants = (uint32_t *) ins->constants;
1211
1212                 /* Copy them wholesale */
1213                 for (unsigned i = 0; i < 4; ++i)
1214                         bundles[i] = constants[i];
1215         } else {
1216                 memcpy(bundle.constants, ins->constants, sizeof(bundle.constants));
1217         }
1218
1219         if (ins->writeout) {
1220                 unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0];
1221                 unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx);
1222                 midgard_instruction mov = v_mov(src, blank_alu_src, temp);
1223                 vmul = mem_dup(&mov, sizeof(midgard_instruction));
1224                 vmul->unit = UNIT_VMUL;
1225                 vmul->mask = 0xF;
1226                 /* TODO: Don't leak */
1227
1228                 /* Rewrite to use our temp */
1229                 midgard_instruction *stages[] = { sadd, vadd, smul };
1230
1231                 for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
1232                         if (stages[i])
1233                                 mir_rewrite_index_dst_single(stages[i], src, temp);
1234                 }
1235
1236                 mir_rewrite_index_src_single(branch, src, temp);
1237         }
1238
1239         if ((vadd && OP_IS_CSEL(vadd->alu.op)) || (smul && OP_IS_CSEL(smul->alu.op)) || (ins->compact_branch && !ins->prepacked_branch && ins->branch.conditional)) {
1240                 midgard_instruction *cond = mir_choose_instruction(instructions, worklist, len, &predicate);
1241                 mir_update_worklist(worklist, len, instructions, cond);
1242
1243                 if (!cond->unit) {
1244                         unsigned units = alu_opcode_props[cond->alu.op].props;
1245
1246                         if (units & UNIT_VMUL) {
1247                                 cond->unit = UNIT_VMUL;
1248                         } else if (units & UNIT_VADD) {
1249                                 cond->unit = UNIT_VADD;
1250                         } else
1251                                 assert(0);
1252                 }
1253
1254                 if (cond->unit & UNIT_VMUL)
1255                         vmul = cond;
1256                 else if (cond->unit & UNIT_SADD)
1257                         sadd = cond;
1258                 else if (cond->unit & UNIT_VADD)
1259                         vadd = cond;
1260                 else if (cond->unit & UNIT_SMUL)
1261                         smul = cond;
1262                 else
1263                         unreachable("Bad condition");
1264         }
1265
1266         unsigned padding = 0;
1267
1268         /* Now that we have finished scheduling, build up the bundle */
1269         midgard_instruction *stages[] = { vmul, sadd, vadd, smul, vlut, branch };
1270
1271         for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
1272                 if (stages[i]) {
1273                         bundle.control |= stages[i]->unit;
1274                         bytes_emitted += bytes_for_instruction(stages[i]);
1275                         bundle.instructions[bundle.instruction_count++] = stages[i];
1276                 }
1277         }
1278
1279         /* Pad ALU op to nearest word */
1280
1281         if (bytes_emitted & 15) {
1282                 padding = 16 - (bytes_emitted & 15);
1283                 bytes_emitted += padding;
1284         }
1285
1286         /* Constants must always be quadwords */
1287         if (bundle.has_embedded_constants)
1288                 bytes_emitted += 16;
1289
1290         /* Size ALU instruction for tag */
1291         bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
1292         bundle.padding = padding;
1293         bundle.control |= bundle.tag;
1294
1295         return bundle;
1296 }
1297
1298 /* Schedule a single block by iterating its instruction to create bundles.
1299  * While we go, tally about the bundle sizes to compute the block size. */
1300
1301
1302 static void
1303 schedule_block(compiler_context *ctx, midgard_block *block)
1304 {
1305         /* Copy list to dynamic array */
1306         unsigned len = 0;
1307         midgard_instruction **instructions = flatten_mir(block, &len);
1308
1309         /* Calculate dependencies and initial worklist */
1310         unsigned node_count = ctx->temp_count + 1;
1311         mir_create_dependency_graph(instructions, len, node_count);
1312
1313         /* Allocate the worklist */
1314         size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD);
1315         BITSET_WORD *worklist = calloc(sz, 1);
1316         mir_initialize_worklist(worklist, instructions, len);
1317
1318         util_dynarray_init(&block->bundles, NULL);
1319
1320         block->quadword_count = 0;
1321
1322         int skip = 0;
1323         mir_foreach_instr_in_block(block, ins) {
1324                 if (skip) {
1325                         skip--;
1326                         continue;
1327                 }
1328
1329                 midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
1330                 util_dynarray_append(&block->bundles, midgard_bundle, bundle);
1331
1332                 if (bundle.has_blend_constant) {
1333                         unsigned offset = ctx->quadword_count + block->quadword_count + quadword_size(bundle.tag) - 1;
1334                         ctx->blend_constant_offset = offset * 0x10;
1335                 }
1336
1337                 block->quadword_count += quadword_size(bundle.tag);
1338         }
1339
1340         block->is_scheduled = true;
1341         ctx->quadword_count += block->quadword_count;
1342 }
1343
1344 /* The following passes reorder MIR instructions to enable better scheduling */
1345
1346 static void
1347 midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
1348 {
1349         mir_foreach_instr_in_block_safe(block, ins) {
1350                 if (ins->type != TAG_LOAD_STORE_4) continue;
1351
1352                 /* We've found a load/store op. Check if next is also load/store. */
1353                 midgard_instruction *next_op = mir_next_op(ins);
1354                 if (&next_op->link != &block->instructions) {
1355                         if (next_op->type == TAG_LOAD_STORE_4) {
1356                                 /* If so, we're done since we're a pair */
1357                                 ins = mir_next_op(ins);
1358                                 continue;
1359                         }
1360
1361                         /* Maximum search distance to pair, to avoid register pressure disasters */
1362                         int search_distance = 8;
1363
1364                         /* Otherwise, we have an orphaned load/store -- search for another load */
1365                         mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
1366                                 /* Terminate search if necessary */
1367                                 if (!(search_distance--)) break;
1368
1369                                 if (c->type != TAG_LOAD_STORE_4) continue;
1370
1371                                 /* We can only reorder if there are no sources */
1372
1373                                 bool deps = false;
1374
1375                                 for (unsigned s = 0; s < ARRAY_SIZE(ins->src); ++s)
1376                                         deps |= (c->src[s] != ~0);
1377
1378                                 if (deps)
1379                                         continue;
1380
1381                                 /* We found one! Move it up to pair and remove it from the old location */
1382
1383                                 mir_insert_instruction_before(ctx, ins, *c);
1384                                 mir_remove_instruction(c);
1385
1386                                 break;
1387                         }
1388                 }
1389         }
1390 }
1391
1392 /* When we're 'squeezing down' the values in the IR, we maintain a hash
1393  * as such */
1394
1395 static unsigned
1396 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
1397 {
1398         if (hash >= SSA_FIXED_MINIMUM)
1399                 return hash;
1400
1401         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
1402                                 ctx->hash_to_temp, hash + 1);
1403
1404         if (temp)
1405                 return temp - 1;
1406
1407         /* If no temp is find, allocate one */
1408         temp = ctx->temp_count++;
1409         ctx->max_hash = MAX2(ctx->max_hash, hash);
1410
1411         _mesa_hash_table_u64_insert(ctx->hash_to_temp,
1412                                     hash + 1, (void *) ((uintptr_t) temp + 1));
1413
1414         return temp;
1415 }
1416
1417 /* Reassigns numbering to get rid of gaps in the indices */
1418
1419 static void
1420 mir_squeeze_index(compiler_context *ctx)
1421 {
1422         /* Reset */
1423         ctx->temp_count = 0;
1424         /* TODO don't leak old hash_to_temp */
1425         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
1426
1427         mir_foreach_instr_global(ctx, ins) {
1428                 ins->dest = find_or_allocate_temp(ctx, ins->dest);
1429
1430                 for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
1431                         ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
1432         }
1433 }
1434
1435 static midgard_instruction
1436 v_load_store_scratch(
1437                 unsigned srcdest,
1438                 unsigned index,
1439                 bool is_store,
1440                 unsigned mask)
1441 {
1442         /* We index by 32-bit vec4s */
1443         unsigned byte = (index * 4 * 4);
1444
1445         midgard_instruction ins = {
1446                 .type = TAG_LOAD_STORE_4,
1447                 .mask = mask,
1448                 .dest = ~0,
1449                 .src = { ~0, ~0, ~0 },
1450                 .load_store = {
1451                         .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
1452                         .swizzle = SWIZZLE_XYZW,
1453
1454                         /* For register spilling - to thread local storage */
1455                         .arg_1 = 0xEA,
1456                         .arg_2 = 0x1E,
1457
1458                         /* Splattered across, TODO combine logically */
1459                         .varying_parameters = (byte & 0x1FF) << 1,
1460                         .address = (byte >> 9)
1461                 },
1462
1463                 /* If we spill an unspill, RA goes into an infinite loop */
1464                 .no_spill = true
1465         };
1466
1467        if (is_store) {
1468                 /* r0 = r26, r1 = r27 */
1469                 assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
1470                 ins.src[0] = srcdest;
1471         } else {
1472                 ins.dest = srcdest;
1473         }
1474
1475         return ins;
1476 }
1477
1478 /* If register allocation fails, find the best spill node and spill it to fix
1479  * whatever the issue was. This spill node could be a work register (spilling
1480  * to thread local storage), but it could also simply be a special register
1481  * that needs to spill to become a work register. */
1482
1483 static void mir_spill_register(
1484                 compiler_context *ctx,
1485                 struct ra_graph *g,
1486                 unsigned *spill_count)
1487 {
1488         unsigned spill_index = ctx->temp_count;
1489
1490         /* Our first step is to calculate spill cost to figure out the best
1491          * spill node. All nodes are equal in spill cost, but we can't spill
1492          * nodes written to from an unspill */
1493
1494         for (unsigned i = 0; i < ctx->temp_count; ++i) {
1495                 ra_set_node_spill_cost(g, i, 1.0);
1496         }
1497
1498         /* We can't spill any bundles that contain unspills. This could be
1499          * optimized to allow use of r27 to spill twice per bundle, but if
1500          * you're at the point of optimizing spilling, it's too late. */
1501
1502         mir_foreach_block(ctx, block) {
1503                 mir_foreach_bundle_in_block(block, bun) {
1504                         bool no_spill = false;
1505
1506                         for (unsigned i = 0; i < bun->instruction_count; ++i)
1507                                 no_spill |= bun->instructions[i]->no_spill;
1508
1509                         if (!no_spill)
1510                                 continue;
1511
1512                         for (unsigned i = 0; i < bun->instruction_count; ++i) {
1513                                 unsigned dest = bun->instructions[i]->dest;
1514                                 if (dest < ctx->temp_count)
1515                                         ra_set_node_spill_cost(g, dest, -1.0);
1516                         }
1517                 }
1518         }
1519
1520         int spill_node = ra_get_best_spill_node(g);
1521
1522         if (spill_node < 0) {
1523                 mir_print_shader(ctx);
1524                 assert(0);
1525         }
1526
1527         /* We have a spill node, so check the class. Work registers
1528          * legitimately spill to TLS, but special registers just spill to work
1529          * registers */
1530
1531         unsigned class = ra_get_node_class(g, spill_node);
1532         bool is_special = (class >> 2) != REG_CLASS_WORK;
1533         bool is_special_w = (class >> 2) == REG_CLASS_TEXW;
1534
1535         /* Allocate TLS slot (maybe) */
1536         unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
1537
1538         /* For TLS, replace all stores to the spilled node. For
1539          * special reads, just keep as-is; the class will be demoted
1540          * implicitly. For special writes, spill to a work register */
1541
1542         if (!is_special || is_special_w) {
1543                 if (is_special_w)
1544                         spill_slot = spill_index++;
1545
1546                 mir_foreach_block(ctx, block) {
1547                 mir_foreach_instr_in_block_safe(block, ins) {
1548                         if (ins->dest != spill_node) continue;
1549
1550                         midgard_instruction st;
1551
1552                         if (is_special_w) {
1553                                 st = v_mov(spill_node, blank_alu_src, spill_slot);
1554                                 st.no_spill = true;
1555                         } else {
1556                                 ins->dest = SSA_FIXED_REGISTER(26);
1557                                 ins->no_spill = true;
1558                                 st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
1559                         }
1560
1561                         /* Hint: don't rewrite this node */
1562                         st.hint = true;
1563
1564                         mir_insert_instruction_after_scheduled(ctx, block, ins, st);
1565
1566                         if (!is_special)
1567                                 ctx->spills++;
1568                 }
1569                 }
1570         }
1571
1572         /* For special reads, figure out how many components we need */
1573         unsigned read_mask = 0;
1574
1575         mir_foreach_instr_global_safe(ctx, ins) {
1576                 read_mask |= mir_mask_of_read_components(ins, spill_node);
1577         }
1578
1579         /* Insert a load from TLS before the first consecutive
1580          * use of the node, rewriting to use spilled indices to
1581          * break up the live range. Or, for special, insert a
1582          * move. Ironically the latter *increases* register
1583          * pressure, but the two uses of the spilling mechanism
1584          * are somewhat orthogonal. (special spilling is to use
1585          * work registers to back special registers; TLS
1586          * spilling is to use memory to back work registers) */
1587
1588         mir_foreach_block(ctx, block) {
1589                 bool consecutive_skip = false;
1590                 unsigned consecutive_index = 0;
1591
1592                 mir_foreach_instr_in_block(block, ins) {
1593                         /* We can't rewrite the moves used to spill in the
1594                          * first place. These moves are hinted. */
1595                         if (ins->hint) continue;
1596
1597                         if (!mir_has_arg(ins, spill_node)) {
1598                                 consecutive_skip = false;
1599                                 continue;
1600                         }
1601
1602                         if (consecutive_skip) {
1603                                 /* Rewrite */
1604                                 mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
1605                                 continue;
1606                         }
1607
1608                         if (!is_special_w) {
1609                                 consecutive_index = ++spill_index;
1610
1611                                 midgard_instruction *before = ins;
1612
1613                                 /* For a csel, go back one more not to break up the bundle */
1614                                 if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
1615                                         before = mir_prev_op(before);
1616
1617                                 midgard_instruction st;
1618
1619                                 if (is_special) {
1620                                         /* Move */
1621                                         st = v_mov(spill_node, blank_alu_src, consecutive_index);
1622                                         st.no_spill = true;
1623                                 } else {
1624                                         /* TLS load */
1625                                         st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
1626                                 }
1627
1628                                 /* Mask the load based on the component count
1629                                  * actually needed to prevent RA loops */
1630
1631                                 st.mask = read_mask;
1632
1633                                 mir_insert_instruction_before_scheduled(ctx, block, before, st);
1634                                // consecutive_skip = true;
1635                         } else {
1636                                 /* Special writes already have their move spilled in */
1637                                 consecutive_index = spill_slot;
1638                         }
1639
1640
1641                         /* Rewrite to use */
1642                         mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
1643
1644                         if (!is_special)
1645                                 ctx->fills++;
1646                 }
1647         }
1648
1649         /* Reset hints */
1650
1651         mir_foreach_instr_global(ctx, ins) {
1652                 ins->hint = false;
1653         }
1654 }
1655
1656 void
1657 schedule_program(compiler_context *ctx)
1658 {
1659         struct ra_graph *g = NULL;
1660         bool spilled = false;
1661         int iter_count = 1000; /* max iterations */
1662
1663         /* Number of 128-bit slots in memory we've spilled into */
1664         unsigned spill_count = 0;
1665
1666         midgard_promote_uniforms(ctx, 16);
1667
1668         mir_foreach_block(ctx, block) {
1669                 midgard_pair_load_store(ctx, block);
1670         }
1671
1672         /* Must be lowered right before RA */
1673         mir_squeeze_index(ctx);
1674         mir_lower_special_reads(ctx);
1675         mir_squeeze_index(ctx);
1676
1677         /* Lowering can introduce some dead moves */
1678
1679         mir_foreach_block(ctx, block) {
1680                 midgard_opt_dead_move_eliminate(ctx, block);
1681                 schedule_block(ctx, block);
1682         }
1683
1684         mir_create_pipeline_registers(ctx);
1685
1686         do {
1687                 if (spilled)
1688                         mir_spill_register(ctx, g, &spill_count);
1689
1690                 mir_squeeze_index(ctx);
1691
1692                 g = NULL;
1693                 g = allocate_registers(ctx, &spilled);
1694         } while(spilled && ((iter_count--) > 0));
1695
1696         if (iter_count <= 0) {
1697                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
1698                 assert(0);
1699         }
1700
1701         /* Report spilling information. spill_count is in 128-bit slots (vec4 x
1702          * fp32), but tls_size is in bytes, so multiply by 16 */
1703
1704         ctx->tls_size = spill_count * 16;
1705
1706         install_registers(ctx, g);
1707 }