src/panfrost/midgard/midgard_schedule.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "compiler.h"
  25 #include "midgard_ops.h"
  26 #include "util/u_memory.h"
  27 #include "util/register_allocate.h"
  28
/* Scheduling for Midgard is complicated, to say the least. ALU instructions
 * must be grouped into VLIW bundles according to the following model:
  31  *
  32  * [VMUL] [SADD]
  33  * [VADD] [SMUL] [VLUT]
  34  *
  35  * A given instruction can execute on some subset of the units (or a few can
  36  * execute on all). Instructions can be either vector or scalar; only scalar
  37  * instructions can execute on SADD/SMUL units. Units on a given line execute
  38  * in parallel. Subsequent lines execute separately and can pass results
  39  * directly via pipeline registers r24/r25, bypassing the register file.
  40  *
  41  * A bundle can optionally have 128-bits of embedded constants, shared across
  42  * all of the instructions within a bundle.
  43  *
  44  * Instructions consuming conditionals (branches and conditional selects)
  45  * require their condition to be written into the conditional register (r31)
  46  * within the same bundle they are consumed.
  47  *
  48  * Fragment writeout requires its argument to be written in full within the
  49  * same bundle as the branch, with no hanging dependencies.
  50  *
  51  * Load/store instructions are also in bundles of simply two instructions, and
  52  * texture instructions have no bundling.
  53  *
  54  * -------------------------------------------------------------------------
  55  *
  56  */
  57
  58 /* We create the dependency graph with per-component granularity */
  59
  60 #define COMPONENT_COUNT 8
  61
  62 static void
  63 add_dependency(struct util_dynarray *table, unsigned index, unsigned mask, midgard_instruction **instructions, unsigned child)
  64 {
  65         for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
  66                 if (!(mask & (1 << i)))
  67                         continue;
  68
  69                 struct util_dynarray *parents = &table[(COMPONENT_COUNT * index) + i];
  70
  71                 util_dynarray_foreach(parents, unsigned, parent) {
  72                         BITSET_WORD *dependents = instructions[*parent]->dependents;
  73
  74                         /* Already have the dependency */
  75                         if (BITSET_TEST(dependents, child))
  76                                 continue;
  77
  78                         BITSET_SET(dependents, child);
  79                         instructions[child]->nr_dependencies++;
  80                 }
  81         }
  82 }
  83
  84 static void
  85 mark_access(struct util_dynarray *table, unsigned index, unsigned mask, unsigned parent)
  86 {
  87         for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
  88                 if (!(mask & (1 << i)))
  89                         continue;
  90
  91                 util_dynarray_append(&table[(COMPONENT_COUNT * index) + i], unsigned, parent);
  92         }
  93 }
  94
  95 static void
  96 mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count)
  97 {
  98         size_t sz = node_count * COMPONENT_COUNT;
  99
 100         struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz);
 101         struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz);
 102
 103         for (unsigned i = 0; i < sz; ++i) {
 104                 util_dynarray_init(&last_read[i], NULL);
 105                 util_dynarray_init(&last_write[i], NULL);
 106         }
 107
 108         /* Initialize dependency graph */
 109         for (unsigned i = 0; i < count; ++i) {
 110                 instructions[i]->dependents =
 111                         calloc(BITSET_WORDS(count), sizeof(BITSET_WORD));
 112
 113                 instructions[i]->nr_dependencies = 0;
 114         }
 115
 116         /* Populate dependency graph */
 117         for (signed i = count - 1; i >= 0; --i) {
 118                 if (instructions[i]->compact_branch)
 119                         continue;
 120
 121                 unsigned dest = instructions[i]->dest;
 122                 unsigned mask = instructions[i]->mask;
 123
 124                 mir_foreach_src((*instructions), s) {
 125                         unsigned src = instructions[i]->src[s];
 126
 127                         if (src < node_count) {
 128                                 unsigned readmask = mir_mask_of_read_components(instructions[i], src);
 129                                 add_dependency(last_write, src, readmask, instructions, i);
 130                         }
 131                 }
 132
 133                 if (dest < node_count) {
 134                         add_dependency(last_read, dest, mask, instructions, i);
 135                         add_dependency(last_write, dest, mask, instructions, i);
 136                         mark_access(last_write, dest, mask, i);
 137                 }
 138
 139                 mir_foreach_src((*instructions), s) {
 140                         unsigned src = instructions[i]->src[s];
 141
 142                         if (src < node_count) {
 143                                 unsigned readmask = mir_mask_of_read_components(instructions[i], src);
 144                                 mark_access(last_read, src, readmask, i);
 145                         }
 146                 }
 147         }
 148
 149         /* If there is a branch, all instructions depend on it, as interblock
 150          * execution must be purely in-order */
 151
 152         if (instructions[count - 1]->compact_branch) {
 153                 BITSET_WORD *dependents = instructions[count - 1]->dependents;
 154
 155                 for (signed i = count - 2; i >= 0; --i) {
 156                         if (BITSET_TEST(dependents, i))
 157                                 continue;
 158
 159                         BITSET_SET(dependents, i);
 160                         instructions[i]->nr_dependencies++;
 161                 }
 162         }
 163
 164         /* Free the intermediate structures */
 165         for (unsigned i = 0; i < sz; ++i) {
 166                 util_dynarray_fini(&last_read[i]);
 167                 util_dynarray_fini(&last_write[i]);
 168         }
 169 }
 170
 171 /* Create a mask of accessed components from a swizzle to figure out vector
 172  * dependencies */
 173
 174 static unsigned
 175 swizzle_to_access_mask(unsigned swizzle)
 176 {
 177         unsigned component_mask = 0;
 178
 179         for (int i = 0; i < 4; ++i) {
 180                 unsigned c = (swizzle >> (2 * i)) & 3;
 181                 component_mask |= (1 << c);
 182         }
 183
 184         return component_mask;
 185 }
 186
 187 /* Does the mask cover more than a scalar? */
 188
 189 static bool
 190 is_single_component_mask(unsigned mask)
 191 {
 192         int components = 0;
 193
 194         for (int c = 0; c < 8; ++c) {
 195                 if (mask & (1 << c))
 196                         components++;
 197         }
 198
 199         return components == 1;
 200 }
 201
 202 /* Checks for an SSA data hazard between two adjacent instructions, keeping in
 203  * mind that we are a vector architecture and we can write to different
 204  * components simultaneously */
 205
 206 static bool
 207 can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
 208 {
 209         /* Writeout has its own rules anyway */
 210         if (first->compact_branch || second->compact_branch)
 211                 return true;
 212
 213         /* Each instruction reads some registers and writes to a register. See
 214          * where the first writes */
 215
 216         int source = first->dest;
 217         int source_mask = first->mask;
 218
 219         /* As long as the second doesn't read from the first, we're okay */
 220         for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
 221                 if (second->src[i] != source)
 222                         continue;
 223
 224                 if (first->type != TAG_ALU_4)
 225                         return false;
 226
 227                 /* Figure out which components we just read from */
 228
 229                 int q = (i == 0) ? second->alu.src1 : second->alu.src2;
 230                 midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
 231
 232                 /* Check if there are components in common, and fail if so */
 233                 if (swizzle_to_access_mask(m->swizzle) & source_mask)
 234                         return false;
 235         }
 236
 237         /* Otherwise, it's safe in that regard. Another data hazard is both
 238          * writing to the same place, of course */
 239
 240         if (second->dest == source) {
 241                 /* ...but only if the components overlap */
 242
 243                 if (second->mask & source_mask)
 244                         return false;
 245         }
 246
 247         /* ...That's it */
 248         return true;
 249 }
 250
 251 static bool
 252 midgard_has_hazard(
 253         midgard_instruction **segment, unsigned segment_size,
 254         midgard_instruction *ains)
 255 {
 256         for (int s = 0; s < segment_size; ++s)
 257                 if (!can_run_concurrent_ssa(segment[s], ains))
 258                         return true;
 259
 260         return false;
 261
 262
 263 }
 264
 265 /* Fragment writeout (of r0) is allowed when:
 266  *
 267  *  - All components of r0 are written in the bundle
 268  *  - No components of r0 are written in VLUT
 269  *  - Non-pipelined dependencies of r0 are not written in the bundle
 270  *
 271  * This function checks if these requirements are satisfied given the content
 272  * of a scheduled bundle.
 273  */
 274
 275 static bool
 276 can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count, unsigned r0)
 277 {
 278         /* First scan for which components of r0 are written out. Initially
 279          * none are written */
 280
 281         uint8_t r0_written_mask = 0x0;
 282
 283         /* Simultaneously we scan for the set of dependencies */
 284
 285         size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
 286         BITSET_WORD *dependencies = calloc(1, sz);
 287         memset(dependencies, 0, sz);
 288
 289         bool success = false;
 290
 291         for (unsigned i = 0; i < count; ++i) {
 292                 midgard_instruction *ins = bundle[i];
 293
 294                 if (ins->dest != r0)
 295                         continue;
 296
 297                 /* Record written out mask */
 298                 r0_written_mask |= ins->mask;
 299
 300                 /* Record dependencies, but only if they won't become pipeline
 301                  * registers. We know we can't be live after this, because
 302                  * we're writeout at the very end of the shader. So check if
 303                  * they were written before us. */
 304
 305                 unsigned src0 = ins->src[0];
 306                 unsigned src1 = ins->src[1];
 307
 308                 if (!mir_is_written_before(ctx, bundle[0], src0))
 309                         src0 = ~0;
 310
 311                 if (!mir_is_written_before(ctx, bundle[0], src1))
 312                         src1 = ~0;
 313
 314                 if (src0 < node_count)
 315                         BITSET_SET(dependencies, src0);
 316
 317                 if (src1 < node_count)
 318                         BITSET_SET(dependencies, src1);
 319
 320                 /* Requirement 2 */
 321                 if (ins->unit == UNIT_VLUT)
 322                         goto done;
 323         }
 324
 325         /* Requirement 1 */
 326         if ((r0_written_mask & 0xF) != 0xF)
 327                 goto done;
 328
 329         /* Requirement 3 */
 330
 331         for (unsigned i = 0; i < count; ++i) {
 332                 unsigned dest = bundle[i]->dest;
 333
 334                 if (dest < node_count && BITSET_TEST(dependencies, dest))
 335                         goto done;
 336         }
 337
 338         /* Otherwise, we're good to go */
 339         success = true;
 340
 341 done:
 342         free(dependencies);
 343         return success;
 344 }
 345
/* Helpers for scheduling */
 347
 348 static bool
 349 mir_is_scalar(midgard_instruction *ains)
 350 {
 351         /* Does the op support scalar units? */
 352         if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR))
 353                 return false;
 354
 355         /* Do we try to use it as a vector op? */
 356         if (!is_single_component_mask(ains->mask))
 357                 return false;
 358
 359         /* Otherwise, check mode hazards */
 360         bool could_scalar = true;
 361
 362         /* Only 16/32-bit can run on a scalar unit */
 363         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
 364         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
 365         could_scalar &= ains->alu.dest_override == midgard_dest_override_none;
 366
 367         if (ains->alu.reg_mode == midgard_reg_mode_16) {
 368                 /* If we're running in 16-bit mode, we
 369                  * can't have any 8-bit sources on the
 370                  * scalar unit (since the scalar unit
 371                  * doesn't understand 8-bit) */
 372
 373                 midgard_vector_alu_src s1 =
 374                         vector_alu_from_unsigned(ains->alu.src1);
 375
 376                 could_scalar &= !s1.half;
 377
 378                 midgard_vector_alu_src s2 =
 379                         vector_alu_from_unsigned(ains->alu.src2);
 380
 381                 could_scalar &= !s2.half;
 382         }
 383
 384         return could_scalar;
 385 }
 386
 387 /* How many bytes does this ALU instruction add to the bundle? */
 388
 389 static unsigned
 390 bytes_for_instruction(midgard_instruction *ains)
 391 {
 392         if (ains->unit & UNITS_ANY_VECTOR)
 393                 return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
 394         else if (ains->unit == ALU_ENAB_BRANCH)
 395                 return sizeof(midgard_branch_extended);
 396         else if (ains->compact_branch)
 397                 return sizeof(ains->br_compact);
 398         else
 399                 return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
 400 }
 401
 402 /* Schedules, but does not emit, a single basic block. After scheduling, the
 403  * final tag and size of the block are known, which are necessary for branching
 404  * */
 405
 406 static midgard_bundle
 407 schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
 408 {
 409         int instructions_emitted = 0, packed_idx = 0;
 410         midgard_bundle bundle = { 0 };
 411
 412         midgard_instruction *scheduled[5] = { NULL };
 413
 414         uint8_t tag = ins->type;
 415
 416         /* Default to the instruction's tag */
 417         bundle.tag = tag;
 418
 419         switch (ins->type) {
 420         case TAG_ALU_4: {
 421                 uint32_t control = 0;
 422                 size_t bytes_emitted = sizeof(control);
 423
 424                 /* TODO: Constant combining */
 425                 int index = 0, last_unit = 0;
 426
 427                 /* Previous instructions, for the purpose of parallelism */
 428                 midgard_instruction *segment[4] = {0};
 429                 int segment_size = 0;
 430
 431                 instructions_emitted = -1;
 432                 midgard_instruction *pins = ins;
 433
 434                 unsigned constant_count = 0;
 435
 436                 for (;;) {
 437                         midgard_instruction *ains = pins;
 438
 439                         /* Advance instruction pointer */
 440                         if (index) {
 441                                 ains = mir_next_op(pins);
 442                                 pins = ains;
 443                         }
 444
 445                         /* Out-of-work condition */
 446                         if ((struct list_head *) ains == &block->instructions)
 447                                 break;
 448
 449                         /* Ensure that the chain can continue */
 450                         if (ains->type != TAG_ALU_4) break;
 451
 452                         /* If there's already something in the bundle and we
 453                          * have weird scheduler constraints, break now */
 454                         if (ains->precede_break && index) break;
 455
 456                         /* According to the presentation "The ARM
 457                          * Mali-T880 Mobile GPU" from HotChips 27,
 458                          * there are two pipeline stages. Branching
 459                          * position determined experimentally. Lines
 460                          * are executed in parallel:
 461                          *
 462                          * [ VMUL ] [ SADD ]
 463                          * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
 464                          *
 465                          * Verify that there are no ordering dependencies here.
 466                          *
 467                          * TODO: Allow for parallelism!!!
 468                          */
 469
 470                         /* Pick a unit for it if it doesn't force a particular unit */
 471
 472                         int unit = ains->unit;
 473
 474                         if (!unit) {
 475                                 int op = ains->alu.op;
 476                                 int units = alu_opcode_props[op].props;
 477                                 bool scalar = mir_is_scalar(ains);
 478
 479                                 if (!scalar) {
 480                                         if (last_unit >= UNIT_VADD) {
 481                                                 if (units & UNIT_VLUT)
 482                                                         unit = UNIT_VLUT;
 483                                                 else
 484                                                         break;
 485                                         } else {
 486                                                 if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
 487                                                         unit = UNIT_VMUL;
 488                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
 489                                                         unit = UNIT_VADD;
 490                                                 else if (units & UNIT_VLUT)
 491                                                         unit = UNIT_VLUT;
 492                                                 else
 493                                                         break;
 494                                         }
 495                                 } else {
 496                                         if (last_unit >= UNIT_VADD) {
 497                                                 if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
 498                                                         unit = UNIT_SMUL;
 499                                                 else if (units & UNIT_VLUT)
 500                                                         unit = UNIT_VLUT;
 501                                                 else
 502                                                         break;
 503                                         } else {
 504                                                 if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
 505                                                         unit = UNIT_VMUL;
 506                                                 else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
 507                                                         unit = UNIT_SADD;
 508                                                 else if (units & UNIT_VADD)
 509                                                         unit = UNIT_VADD;
 510                                                 else if (units & UNIT_SMUL)
 511                                                         unit = UNIT_SMUL;
 512                                                 else if (units & UNIT_VLUT)
 513                                                         unit = UNIT_VLUT;
 514                                                 else
 515                                                         break;
 516                                         }
 517                                 }
 518
 519                                 assert(unit & units);
 520                         }
 521
 522                         /* Late unit check, this time for encoding (not parallelism) */
 523                         if (unit <= last_unit) break;
 524
 525                         /* Clear the segment */
 526                         if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
 527                                 segment_size = 0;
 528
 529                         if (midgard_has_hazard(segment, segment_size, ains))
 530                                 break;
 531
 532                         /* We're good to go -- emit the instruction */
 533                         ains->unit = unit;
 534
 535                         segment[segment_size++] = ains;
 536
 537                         /* We try to reuse constants if possible, by adjusting
 538                          * the swizzle */
 539
 540                         if (ains->has_blend_constant) {
 541                                 /* Everything conflicts with the blend constant */
 542                                 if (bundle.has_embedded_constants)
 543                                         break;
 544
 545                                 bundle.has_blend_constant = 1;
 546                                 bundle.has_embedded_constants = 1;
 547                         } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
 548                                 /* TODO: DRY with the analysis pass */
 549
 550                                 if (bundle.has_blend_constant)
 551                                         break;
 552
 553                                 if (constant_count)
 554                                         break;
 555
 556                                 /* TODO: Fix packing XXX */
 557                                 uint16_t *bundles = (uint16_t *) bundle.constants;
 558                                 uint32_t *constants = (uint32_t *) ains->constants;
 559
 560                                 /* Copy them wholesale */
 561                                 for (unsigned i = 0; i < 4; ++i)
 562                                         bundles[i] = constants[i];
 563
 564                                 bundle.has_embedded_constants = true;
 565                                 constant_count = 4;
 566                         } else if (ains->has_constants) {
 567                                 /* By definition, blend constants conflict with
 568                                  * everything, so if there are already
 569                                  * constants we break the bundle *now* */
 570
 571                                 if (bundle.has_blend_constant)
 572                                         break;
 573
 574                                 /* For anything but blend constants, we can do
 575                                  * proper analysis, however */
 576
 577                                 /* TODO: Mask by which are used */
 578                                 uint32_t *constants = (uint32_t *) ains->constants;
 579                                 uint32_t *bundles = (uint32_t *) bundle.constants;
 580
 581                                 uint32_t indices[4] = { 0 };
 582                                 bool break_bundle = false;
 583
 584                                 for (unsigned i = 0; i < 4; ++i) {
 585                                         uint32_t cons = constants[i];
 586                                         bool constant_found = false;
 587
 588                                         /* Search for the constant */
 589                                         for (unsigned j = 0; j < constant_count; ++j) {
 590                                                 if (bundles[j] != cons)
 591                                                         continue;
 592
 593                                                 /* We found it, reuse */
 594                                                 indices[i] = j;
 595                                                 constant_found = true;
 596                                                 break;
 597                                         }
 598
 599                                         if (constant_found)
 600                                                 continue;
 601
 602                                         /* We didn't find it, so allocate it */
 603                                         unsigned idx = constant_count++;
 604
 605                                         if (idx >= 4) {
 606                                                 /* Uh-oh, out of space */
 607                                                 break_bundle = true;
 608                                                 break;
 609                                         }
 610
 611                                         /* We have space, copy it in! */
 612                                         bundles[idx] = cons;
 613                                         indices[i] = idx;
 614                                 }
 615
 616                                 if (break_bundle)
 617                                         break;
 618
 619                                 /* Cool, we have it in. So use indices as a
 620                                  * swizzle */
 621
 622                                 unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
 623                                 unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
 624
 625                                 if (ains->src[0] == r_constant)
 626                                         ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
 627
 628                                 if (ains->src[1] == r_constant)
 629                                         ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
 630
 631                                 bundle.has_embedded_constants = true;
 632                         }
 633
 634                         if (ains->compact_branch) {
 635                                 /* All of r0 has to be written out along with
 636                                  * the branch writeout */
 637
 638                                 if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count, ains->src[0])) {
 639                                         /* We only work on full moves
 640                                          * at the beginning. We could
 641                                          * probably do better */
 642                                         if (index != 0)
 643                                                 break;
 644
 645                                         /* Inject a move */
 646                                         midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
 647                                         ins.unit = UNIT_VMUL;
 648                                         control |= ins.unit;
 649
 650                                         /* TODO don't leak */
 651                                         midgard_instruction *move =
 652                                                 mem_dup(&ins, sizeof(midgard_instruction));
 653                                         bytes_emitted += bytes_for_instruction(move);
 654                                         bundle.instructions[packed_idx++] = move;
 655                                 }
 656                         }
 657
 658                         bytes_emitted += bytes_for_instruction(ains);
 659
 660                         /* Defer marking until after writing to allow for break */
 661                         scheduled[index] = ains;
 662                         control |= ains->unit;
 663                         last_unit = ains->unit;
 664                         ++instructions_emitted;
 665                         ++index;
 666                 }
 667
 668                 int padding = 0;
 669
 670                 /* Pad ALU op to nearest word */
 671
 672                 if (bytes_emitted & 15) {
 673                         padding = 16 - (bytes_emitted & 15);
 674                         bytes_emitted += padding;
 675                 }
 676
 677                 /* Constants must always be quadwords */
 678                 if (bundle.has_embedded_constants)
 679                         bytes_emitted += 16;
 680
 681                 /* Size ALU instruction for tag */
 682                 bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
 683                 bundle.padding = padding;
 684                 bundle.control = bundle.tag | control;
 685
 686                 break;
 687         }
 688
 689         case TAG_LOAD_STORE_4: {
 690                 /* Load store instructions have two words at once. If
 691                  * we only have one queued up, we need to NOP pad.
 692                  * Otherwise, we store both in succession to save space
 693                  * and cycles -- letting them go in parallel -- skip
 694                  * the next. The usefulness of this optimisation is
 695                  * greatly dependent on the quality of the instruction
 696                  * scheduler.
 697                  */
 698
 699                 midgard_instruction *next_op = mir_next_op(ins);
 700
 701                 if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
 702                         /* TODO: Concurrency check */
 703                         instructions_emitted++;
 704                 }
 705
 706                 break;
 707         }
 708
 709         case TAG_TEXTURE_4: {
 710                 /* Which tag we use depends on the shader stage */
 711                 bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
 712                 bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
 713                 break;
 714         }
 715
 716         default:
 717                 unreachable("Unknown tag");
 718                 break;
 719         }
 720
 721         /* Copy the instructions into the bundle */
 722         bundle.instruction_count = instructions_emitted + 1 + packed_idx;
 723
 724         midgard_instruction *uins = ins;
 725         for (; packed_idx < bundle.instruction_count; ++packed_idx) {
 726                 assert(&uins->link != &block->instructions);
 727                 bundle.instructions[packed_idx] = uins;
 728                 uins = mir_next_op(uins);
 729         }
 730
 731         *skip = instructions_emitted;
 732
 733         return bundle;
 734 }
 735
 736 /* We would like to flatten the linked list of midgard_instructions in a bundle
 737  * to an array of pointers on the heap for easy indexing */
 738
 739 static midgard_instruction **
 740 flatten_mir(midgard_block *block, unsigned *len)
 741 {
 742         *len = list_length(&block->instructions);
 743
 744         if (!(*len))
 745                 return NULL;
 746
 747         midgard_instruction **instructions =
 748                 calloc(sizeof(midgard_instruction *), *len);
 749
 750         unsigned i = 0;
 751
 752         mir_foreach_instr_in_block(block, ins)
 753                 instructions[i++] = ins;
 754
 755         return instructions;
 756 }
 757
 758 /* The worklist is the set of instructions that can be scheduled now; that is,
 759  * the set of instructions with no remaining dependencies */
 760
 761 static void
 762 mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count)
 763 {
 764         for (unsigned i = 0; i < count; ++i) {
 765                 if (instructions[i]->nr_dependencies == 0)
 766                         BITSET_SET(worklist, i);
 767         }
 768 }
 769
 770 /* Update the worklist after an instruction terminates. Remove its edges from
 771  * the graph and if that causes any node to have no dependencies, add it to the
 772  * worklist */
 773
 774 static void
 775 mir_update_worklist(
 776                 BITSET_WORD *worklist, unsigned count,
 777                 midgard_instruction **instructions, midgard_instruction *done)
 778 {
 779         /* Sanity check: if no instruction terminated, there is nothing to do.
 780          * If the instruction that terminated had dependencies, that makes no
 781          * sense and means we messed up the worklist. Finally, as the purpose
 782          * of this routine is to update dependents, we abort early if there are
 783          * no dependents defined. */
 784
 785         if (!done)
 786                 return;
 787
 788         assert(done->nr_dependencies == 0);
 789
 790         if (!done->dependents)
 791                 return;
 792
 793         /* We have an instruction with dependents. Iterate each dependent to
 794          * remove one dependency (`done`), adding dependents to the worklist
 795          * where possible. */
 796
 797         unsigned i;
 798         BITSET_WORD tmp;
 799         BITSET_FOREACH_SET(i, tmp, done->dependents, count) {
 800                 assert(instructions[i]->nr_dependencies);
 801
 802                 if (!(--instructions[i]->nr_dependencies))
 803                         BITSET_SET(worklist, i);
 804         }
 805
 806         free(done->dependents);
 807 }
 808
 809 /* Selection criteria handed to mir_choose_instruction. While scheduling, we
 810  * need to choose instructions satisfying certain criteria. As we schedule
 811  * backwards, the *last* matching instruction in the worklist is chosen to
 812  * simulate in-order scheduling. */
 813
 814 struct midgard_predicate {
 815         /* Required instruction type (a TAG_* value), or ~0 for dont-care */
 816         unsigned tag;
 817
 818         /* True if the chosen instruction should be cleared from the worklist */
 819         bool destructive;
 820 };
 821
 822 static midgard_instruction *
 823 mir_choose_instruction(
 824                 midgard_instruction **instructions,
 825                 BITSET_WORD *worklist, unsigned count,
 826                 struct midgard_predicate *predicate)
 827 {
 828         /* Parse the predicate */
 829         unsigned tag = predicate->tag;
 830
 831         /* Iterate to find the best instruction satisfying the predicate */
 832         unsigned i;
 833         BITSET_WORD tmp;
 834
 835         signed best_index = -1;
 836
 837         BITSET_FOREACH_SET(i, tmp, worklist, count) {
 838                 if (tag != ~0 && instructions[i]->type != tag)
 839                         continue;
 840
 841                 /* Simulate in-order scheduling */
 842                 if ((signed) i < best_index)
 843                         continue;
 844
 845                 best_index = i;
 846         }
 847
 848
 849         /* Did we find anything?  */
 850
 851         if (best_index < 0)
 852                 return NULL;
 853
 854         /* If we found something, remove it from the worklist */
 855         assert(best_index < count);
 856
 857         if (predicate->destructive) {
 858                 BITSET_CLEAR(worklist, best_index);
 859         }
 860
 861         return instructions[best_index];
 862 }
 863
 864 /* Still, we don't choose instructions in a vacuum. We need a way to choose the
 865  * best bundle type (ALU, load/store, texture). Nondestructive. */
 866
 867 static unsigned
 868 mir_choose_bundle(
 869                 midgard_instruction **instructions,
 870                 BITSET_WORD *worklist, unsigned count)
 871 {
 872         /* At the moment, our algorithm is very simple - use the bundle of the
 873          * best instruction, regardless of what else could be scheduled
 874          * alongside it. This is not optimal but it works okay for in-order */
 875
 876         struct midgard_predicate predicate = {
 877                 .tag = ~0,
 878                 .destructive = false
 879         };
 880
 881         midgard_instruction *chosen = mir_choose_instruction(instructions, worklist, count, &predicate);
 882
 883         if (chosen)
 884                 return chosen->type;
 885         else
 886                 return ~0;
 887 }
 888
 889 /* Schedule a single block by iterating its instruction to create bundles.
 890  * While we go, tally about the bundle sizes to compute the block size. */
 891
 892 static void
 893 schedule_block(compiler_context *ctx, midgard_block *block)
 894 {
 895         /* Copy list to dynamic array */
 896         unsigned len = 0;
 897         midgard_instruction **instructions = flatten_mir(block, &len);
 898
 899         /* Calculate dependencies and initial worklist */
 900         unsigned node_count = ctx->temp_count + 1;
 901         mir_create_dependency_graph(instructions, len, node_count);
 902
 903         /* Allocate the worklist */
 904         size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD);
 905         BITSET_WORD *worklist = calloc(sz, 1);
 906         mir_initialize_worklist(worklist, instructions, len);
 907
 908         util_dynarray_init(&block->bundles, NULL);
 909
 910         block->quadword_count = 0;
 911
 912         int skip = 0;
 913         mir_foreach_instr_in_block(block, ins) {
 914                 if (skip) {
 915                         skip--;
 916                         continue;
 917                 }
 918
 919                 midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
 920                 util_dynarray_append(&block->bundles, midgard_bundle, bundle);
 921
 922                 if (bundle.has_blend_constant) {
 923                         unsigned offset = ctx->quadword_count + block->quadword_count + quadword_size(bundle.tag) - 1;
 924                         ctx->blend_constant_offset = offset * 0x10;
 925                 }
 926
 927                 block->quadword_count += quadword_size(bundle.tag);
 928         }
 929
 930         block->is_scheduled = true;
 931         ctx->quadword_count += block->quadword_count;
 932 }
 933
 934 /* The following passes reorder MIR instructions to enable better scheduling */
 935
 936 static void
 937 midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
 938 {
 939         mir_foreach_instr_in_block_safe(block, ins) {
 940                 if (ins->type != TAG_LOAD_STORE_4) continue;
 941
 942                 /* We've found a load/store op. Check if next is also load/store. */
 943                 midgard_instruction *next_op = mir_next_op(ins);
 944                 if (&next_op->link != &block->instructions) {
 945                         if (next_op->type == TAG_LOAD_STORE_4) {
 946                                 /* If so, we're done since we're a pair */
 947                                 ins = mir_next_op(ins);
 948                                 continue;
 949                         }
 950
 951                         /* Maximum search distance to pair, to avoid register pressure disasters */
 952                         int search_distance = 8;
 953
 954                         /* Otherwise, we have an orphaned load/store -- search for another load */
 955                         mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
 956                                 /* Terminate search if necessary */
 957                                 if (!(search_distance--)) break;
 958
 959                                 if (c->type != TAG_LOAD_STORE_4) continue;
 960
 961                                 /* We can only reorder if there are no sources */
 962
 963                                 bool deps = false;
 964
 965                                 for (unsigned s = 0; s < ARRAY_SIZE(ins->src); ++s)
 966                                         deps |= (c->src[s] != ~0);
 967
 968                                 if (deps)
 969                                         continue;
 970
 971                                 /* We found one! Move it up to pair and remove it from the old location */
 972
 973                                 mir_insert_instruction_before(ctx, ins, *c);
 974                                 mir_remove_instruction(c);
 975
 976                                 break;
 977                         }
 978                 }
 979         }
 980 }
 981
 982 /* When we're 'squeezing down' the values in the IR, we maintain a hash
 983  * as such */
 984
 985 static unsigned
 986 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
 987 {
 988         if (hash >= SSA_FIXED_MINIMUM)
 989                 return hash;
 990
 991         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
 992                                 ctx->hash_to_temp, hash + 1);
 993
 994         if (temp)
 995                 return temp - 1;
 996
 997         /* If no temp is find, allocate one */
 998         temp = ctx->temp_count++;
 999         ctx->max_hash = MAX2(ctx->max_hash, hash);
1000
1001         _mesa_hash_table_u64_insert(ctx->hash_to_temp,
1002                                     hash + 1, (void *) ((uintptr_t) temp + 1));
1003
1004         return temp;
1005 }
1006
1007 /* Reassigns numbering to get rid of gaps in the indices */
1008
1009 static void
1010 mir_squeeze_index(compiler_context *ctx)
1011 {
1012         /* Reset */
1013         ctx->temp_count = 0;
1014         /* TODO don't leak old hash_to_temp */
1015         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
1016
1017         mir_foreach_instr_global(ctx, ins) {
1018                 ins->dest = find_or_allocate_temp(ctx, ins->dest);
1019
1020                 for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
1021                         ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
1022         }
1023 }
1024
1025 static midgard_instruction
1026 v_load_store_scratch(
1027                 unsigned srcdest,
1028                 unsigned index,
1029                 bool is_store,
1030                 unsigned mask)
1031 {
1032         /* We index by 32-bit vec4s */
1033         unsigned byte = (index * 4 * 4);
1034
1035         midgard_instruction ins = {
1036                 .type = TAG_LOAD_STORE_4,
1037                 .mask = mask,
1038                 .dest = ~0,
1039                 .src = { ~0, ~0, ~0 },
1040                 .load_store = {
1041                         .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
1042                         .swizzle = SWIZZLE_XYZW,
1043
1044                         /* For register spilling - to thread local storage */
1045                         .arg_1 = 0xEA,
1046                         .arg_2 = 0x1E,
1047
1048                         /* Splattered across, TODO combine logically */
1049                         .varying_parameters = (byte & 0x1FF) << 1,
1050                         .address = (byte >> 9)
1051                 },
1052
1053                 /* If we spill an unspill, RA goes into an infinite loop */
1054                 .no_spill = true
1055         };
1056
1057        if (is_store) {
1058                 /* r0 = r26, r1 = r27 */
1059                 assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
1060                 ins.src[0] = srcdest;
1061         } else {
1062                 ins.dest = srcdest;
1063         }
1064
1065         return ins;
1066 }
1067
1068 /* If register allocation fails, find the best spill node and spill it to fix
1069  * whatever the issue was. This spill node could be a work register (spilling
1070  * to thread local storage), but it could also simply be a special register
1071  * that needs to spill to become a work register. */
1072
1073 static void mir_spill_register(
1074                 compiler_context *ctx,
1075                 struct ra_graph *g,
1076                 unsigned *spill_count)
1077 {
1078         unsigned spill_index = ctx->temp_count;
1079
1080         /* Our first step is to calculate spill cost to figure out the best
1081          * spill node. All nodes are equal in spill cost, but we can't spill
1082          * nodes written to from an unspill */
1083
1084         for (unsigned i = 0; i < ctx->temp_count; ++i) {
1085                 ra_set_node_spill_cost(g, i, 1.0);
1086         }
1087
1088         /* We can't spill any bundles that contain unspills. This could be
1089          * optimized to allow use of r27 to spill twice per bundle, but if
1090          * you're at the point of optimizing spilling, it's too late. */
1091
1092         mir_foreach_block(ctx, block) {
1093                 mir_foreach_bundle_in_block(block, bun) {
1094                         bool no_spill = false;
1095
1096                         for (unsigned i = 0; i < bun->instruction_count; ++i)
1097                                 no_spill |= bun->instructions[i]->no_spill;
1098
1099                         if (!no_spill)
1100                                 continue;
1101
1102                         for (unsigned i = 0; i < bun->instruction_count; ++i) {
1103                                 unsigned dest = bun->instructions[i]->dest;
1104                                 if (dest < ctx->temp_count)
1105                                         ra_set_node_spill_cost(g, dest, -1.0);
1106                         }
1107                 }
1108         }
1109
1110         int spill_node = ra_get_best_spill_node(g);
1111
1112         if (spill_node < 0) {
1113                 mir_print_shader(ctx);
1114                 assert(0);
1115         }
1116
1117         /* We have a spill node, so check the class. Work registers
1118          * legitimately spill to TLS, but special registers just spill to work
1119          * registers */
1120
1121         unsigned class = ra_get_node_class(g, spill_node);
1122         bool is_special = (class >> 2) != REG_CLASS_WORK;
1123         bool is_special_w = (class >> 2) == REG_CLASS_TEXW;
1124
1125         /* Allocate TLS slot (maybe) */
1126         unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
1127
1128         /* For TLS, replace all stores to the spilled node. For
1129          * special reads, just keep as-is; the class will be demoted
1130          * implicitly. For special writes, spill to a work register */
1131
1132         if (!is_special || is_special_w) {
1133                 if (is_special_w)
1134                         spill_slot = spill_index++;
1135
1136                 mir_foreach_block(ctx, block) {
1137                 mir_foreach_instr_in_block_safe(block, ins) {
1138                         if (ins->dest != spill_node) continue;
1139
1140                         midgard_instruction st;
1141
1142                         if (is_special_w) {
1143                                 st = v_mov(spill_node, blank_alu_src, spill_slot);
1144                                 st.no_spill = true;
1145                         } else {
1146                                 ins->dest = SSA_FIXED_REGISTER(26);
1147                                 ins->no_spill = true;
1148                                 st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
1149                         }
1150
1151                         /* Hint: don't rewrite this node */
1152                         st.hint = true;
1153
1154                         mir_insert_instruction_after_scheduled(ctx, block, ins, st);
1155
1156                         if (!is_special)
1157                                 ctx->spills++;
1158                 }
1159                 }
1160         }
1161
1162         /* For special reads, figure out how many components we need */
1163         unsigned read_mask = 0;
1164
1165         mir_foreach_instr_global_safe(ctx, ins) {
1166                 read_mask |= mir_mask_of_read_components(ins, spill_node);
1167         }
1168
1169         /* Insert a load from TLS before the first consecutive
1170          * use of the node, rewriting to use spilled indices to
1171          * break up the live range. Or, for special, insert a
1172          * move. Ironically the latter *increases* register
1173          * pressure, but the two uses of the spilling mechanism
1174          * are somewhat orthogonal. (special spilling is to use
1175          * work registers to back special registers; TLS
1176          * spilling is to use memory to back work registers) */
1177
1178         mir_foreach_block(ctx, block) {
1179                 bool consecutive_skip = false;
1180                 unsigned consecutive_index = 0;
1181
1182                 mir_foreach_instr_in_block(block, ins) {
1183                         /* We can't rewrite the moves used to spill in the
1184                          * first place. These moves are hinted. */
1185                         if (ins->hint) continue;
1186
1187                         if (!mir_has_arg(ins, spill_node)) {
1188                                 consecutive_skip = false;
1189                                 continue;
1190                         }
1191
1192                         if (consecutive_skip) {
1193                                 /* Rewrite */
1194                                 mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
1195                                 continue;
1196                         }
1197
1198                         if (!is_special_w) {
1199                                 consecutive_index = ++spill_index;
1200
1201                                 midgard_instruction *before = ins;
1202
1203                                 /* For a csel, go back one more not to break up the bundle */
1204                                 if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
1205                                         before = mir_prev_op(before);
1206
1207                                 midgard_instruction st;
1208
1209                                 if (is_special) {
1210                                         /* Move */
1211                                         st = v_mov(spill_node, blank_alu_src, consecutive_index);
1212                                         st.no_spill = true;
1213                                 } else {
1214                                         /* TLS load */
1215                                         st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
1216                                 }
1217
1218                                 /* Mask the load based on the component count
1219                                  * actually needed to prvent RA loops */
1220
1221                                 st.mask = read_mask;
1222
1223                                 mir_insert_instruction_before_scheduled(ctx, block, before, st);
1224                                // consecutive_skip = true;
1225                         } else {
1226                                 /* Special writes already have their move spilled in */
1227                                 consecutive_index = spill_slot;
1228                         }
1229
1230
1231                         /* Rewrite to use */
1232                         mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
1233
1234                         if (!is_special)
1235                                 ctx->fills++;
1236                 }
1237         }
1238
1239         /* Reset hints */
1240
1241         mir_foreach_instr_global(ctx, ins) {
1242                 ins->hint = false;
1243         }
1244 }
1245
1246 void
1247 schedule_program(compiler_context *ctx)
1248 {
1249         struct ra_graph *g = NULL;
1250         bool spilled = false;
1251         int iter_count = 1000; /* max iterations */
1252
1253         /* Number of 128-bit slots in memory we've spilled into */
1254         unsigned spill_count = 0;
1255
1256         midgard_promote_uniforms(ctx, 16);
1257
1258         mir_foreach_block(ctx, block) {
1259                 midgard_pair_load_store(ctx, block);
1260         }
1261
1262         /* Must be lowered right before RA */
1263         mir_squeeze_index(ctx);
1264         mir_lower_special_reads(ctx);
1265         mir_squeeze_index(ctx);
1266
1267         /* Lowering can introduce some dead moves */
1268
1269         mir_foreach_block(ctx, block) {
1270                 midgard_opt_dead_move_eliminate(ctx, block);
1271                 schedule_block(ctx, block);
1272         }
1273
1274         mir_create_pipeline_registers(ctx);
1275
1276         do {
1277                 if (spilled)
1278                         mir_spill_register(ctx, g, &spill_count);
1279
1280                 mir_squeeze_index(ctx);
1281
1282                 g = NULL;
1283                 g = allocate_registers(ctx, &spilled);
1284         } while(spilled && ((iter_count--) > 0));
1285
1286         if (iter_count <= 0) {
1287                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
1288                 assert(0);
1289         }
1290
1291         /* Report spilling information. spill_count is in 128-bit slots (vec4 x
1292          * fp32), but tls_size is in bytes, so multiply by 16 */
1293
1294         ctx->tls_size = spill_count * 16;
1295
1296         install_registers(ctx, g);
1297 }