src/panfrost/midgard/midgard_schedule.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "compiler.h"
  25 #include "midgard_ops.h"
  26 #include "util/u_memory.h"
  27 #include "util/register_allocate.h"
  28
  29 /* Scheduling for Midgard is complicated, to say the least. ALU instructions
  30  * must be grouped into VLIW bundles according to following model:
  31  *
  32  * [VMUL] [SADD]
  33  * [VADD] [SMUL] [VLUT]
  34  *
  35  * A given instruction can execute on some subset of the units (or a few can
  36  * execute on all). Instructions can be either vector or scalar; only scalar
  37  * instructions can execute on SADD/SMUL units. Units on a given line execute
  38  * in parallel. Subsequent lines execute separately and can pass results
  39  * directly via pipeline registers r24/r25, bypassing the register file.
  40  *
  41  * A bundle can optionally have 128-bits of embedded constants, shared across
  42  * all of the instructions within a bundle.
  43  *
  44  * Instructions consuming conditionals (branches and conditional selects)
  45  * require their condition to be written into the conditional register (r31)
  46  * within the same bundle they are consumed.
  47  *
  48  * Fragment writeout requires its argument to be written in full within the
  49  * same bundle as the branch, with no hanging dependencies.
  50  *
  51  * Load/store instructions are also in bundles of simply two instructions, and
  52  * texture instructions have no bundling.
  53  *
  54  * -------------------------------------------------------------------------
  55  *
  56  */
  57
  58 /* We create the dependency graph with per-component granularity */
  59
  60 #define COMPONENT_COUNT 8
  61
  62 static void
  63 add_dependency(struct util_dynarray *table, unsigned index, unsigned mask, midgard_instruction **instructions, unsigned child)
  64 {
  65         for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
  66                 if (!(mask & (1 << i)))
  67                         continue;
  68
  69                 struct util_dynarray *parents = &table[(COMPONENT_COUNT * index) + i];
  70
  71                 util_dynarray_foreach(parents, unsigned, parent) {
  72                         BITSET_WORD *dependents = instructions[*parent]->dependents;
  73
  74                         /* Already have the dependency */
  75                         if (BITSET_TEST(dependents, child))
  76                                 continue;
  77
  78                         BITSET_SET(dependents, child);
  79                         instructions[child]->nr_dependencies++;
  80                 }
  81         }
  82 }
  83
  84 static void
  85 mark_access(struct util_dynarray *table, unsigned index, unsigned mask, unsigned parent)
  86 {
  87         for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
  88                 if (!(mask & (1 << i)))
  89                         continue;
  90
  91                 util_dynarray_append(&table[(COMPONENT_COUNT * index) + i], unsigned, parent);
  92         }
  93 }
  94
  95 static void
  96 mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count)
  97 {
  98         size_t sz = node_count * COMPONENT_COUNT;
  99
 100         struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz);
 101         struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz);
 102
 103         for (unsigned i = 0; i < sz; ++i) {
 104                 util_dynarray_init(&last_read[i], NULL);
 105                 util_dynarray_init(&last_write[i], NULL);
 106         }
 107
 108         /* Initialize dependency graph */
 109         for (unsigned i = 0; i < count; ++i) {
 110                 instructions[i]->dependents =
 111                         calloc(BITSET_WORDS(count), sizeof(BITSET_WORD));
 112
 113                 instructions[i]->nr_dependencies = 0;
 114         }
 115
 116         /* Populate dependency graph */
 117         for (signed i = count - 1; i >= 0; --i) {
 118                 if (instructions[i]->compact_branch)
 119                         continue;
 120
 121                 unsigned dest = instructions[i]->dest;
 122                 unsigned mask = instructions[i]->mask;
 123
 124                 mir_foreach_src((*instructions), s) {
 125                         unsigned src = instructions[i]->src[s];
 126
 127                         if (src < node_count) {
 128                                 unsigned readmask = mir_mask_of_read_components(instructions[i], src);
 129                                 add_dependency(last_write, src, readmask, instructions, i);
 130                         }
 131                 }
 132
 133                 if (dest < node_count) {
 134                         add_dependency(last_read, dest, mask, instructions, i);
 135                         add_dependency(last_write, dest, mask, instructions, i);
 136                         mark_access(last_write, dest, mask, i);
 137                 }
 138
 139                 mir_foreach_src((*instructions), s) {
 140                         unsigned src = instructions[i]->src[s];
 141
 142                         if (src < node_count) {
 143                                 unsigned readmask = mir_mask_of_read_components(instructions[i], src);
 144                                 mark_access(last_read, src, readmask, i);
 145                         }
 146                 }
 147         }
 148
 149         /* If there is a branch, all instructions depend on it, as interblock
 150          * execution must be purely in-order */
 151
 152         if (instructions[count - 1]->compact_branch) {
 153                 BITSET_WORD *dependents = instructions[count - 1]->dependents;
 154
 155                 for (signed i = count - 2; i >= 0; --i) {
 156                         if (BITSET_TEST(dependents, i))
 157                                 continue;
 158
 159                         BITSET_SET(dependents, i);
 160                         instructions[i]->nr_dependencies++;
 161                 }
 162         }
 163
 164         /* Free the intermediate structures */
 165         for (unsigned i = 0; i < sz; ++i) {
 166                 util_dynarray_fini(&last_read[i]);
 167                 util_dynarray_fini(&last_write[i]);
 168         }
 169 }
 170
 171 /* Create a mask of accessed components from a swizzle to figure out vector
 172  * dependencies */
 173
 174 static unsigned
 175 swizzle_to_access_mask(unsigned swizzle)
 176 {
 177         unsigned component_mask = 0;
 178
 179         for (int i = 0; i < 4; ++i) {
 180                 unsigned c = (swizzle >> (2 * i)) & 3;
 181                 component_mask |= (1 << c);
 182         }
 183
 184         return component_mask;
 185 }
 186
 187 /* Does the mask cover more than a scalar? */
 188
 189 static bool
 190 is_single_component_mask(unsigned mask)
 191 {
 192         int components = 0;
 193
 194         for (int c = 0; c < 8; ++c) {
 195                 if (mask & (1 << c))
 196                         components++;
 197         }
 198
 199         return components == 1;
 200 }
 201
 202 /* Checks for an SSA data hazard between two adjacent instructions, keeping in
 203  * mind that we are a vector architecture and we can write to different
 204  * components simultaneously */
 205
 206 static bool
 207 can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
 208 {
 209         /* Writeout has its own rules anyway */
 210         if (first->compact_branch || second->compact_branch)
 211                 return true;
 212
 213         /* Each instruction reads some registers and writes to a register. See
 214          * where the first writes */
 215
 216         int source = first->dest;
 217         int source_mask = first->mask;
 218
 219         /* As long as the second doesn't read from the first, we're okay */
 220         for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
 221                 if (second->src[i] != source)
 222                         continue;
 223
 224                 if (first->type != TAG_ALU_4)
 225                         return false;
 226
 227                 /* Figure out which components we just read from */
 228
 229                 int q = (i == 0) ? second->alu.src1 : second->alu.src2;
 230                 midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
 231
 232                 /* Check if there are components in common, and fail if so */
 233                 if (swizzle_to_access_mask(m->swizzle) & source_mask)
 234                         return false;
 235         }
 236
 237         /* Otherwise, it's safe in that regard. Another data hazard is both
 238          * writing to the same place, of course */
 239
 240         if (second->dest == source) {
 241                 /* ...but only if the components overlap */
 242
 243                 if (second->mask & source_mask)
 244                         return false;
 245         }
 246
 247         /* ...That's it */
 248         return true;
 249 }
 250
 251 static bool
 252 midgard_has_hazard(
 253         midgard_instruction **segment, unsigned segment_size,
 254         midgard_instruction *ains)
 255 {
 256         for (int s = 0; s < segment_size; ++s)
 257                 if (!can_run_concurrent_ssa(segment[s], ains))
 258                         return true;
 259
 260         return false;
 261
 262
 263 }
 264
 265 /* Fragment writeout (of r0) is allowed when:
 266  *
 267  *  - All components of r0 are written in the bundle
 268  *  - No components of r0 are written in VLUT
 269  *  - Non-pipelined dependencies of r0 are not written in the bundle
 270  *
 271  * This function checks if these requirements are satisfied given the content
 272  * of a scheduled bundle.
 273  */
 274
 275 static bool
 276 can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count, unsigned r0)
 277 {
 278         /* First scan for which components of r0 are written out. Initially
 279          * none are written */
 280
 281         uint8_t r0_written_mask = 0x0;
 282
 283         /* Simultaneously we scan for the set of dependencies */
 284
 285         size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
 286         BITSET_WORD *dependencies = calloc(1, sz);
 287         memset(dependencies, 0, sz);
 288
 289         bool success = false;
 290
 291         for (unsigned i = 0; i < count; ++i) {
 292                 midgard_instruction *ins = bundle[i];
 293
 294                 if (ins->dest != r0)
 295                         continue;
 296
 297                 /* Record written out mask */
 298                 r0_written_mask |= ins->mask;
 299
 300                 /* Record dependencies, but only if they won't become pipeline
 301                  * registers. We know we can't be live after this, because
 302                  * we're writeout at the very end of the shader. So check if
 303                  * they were written before us. */
 304
 305                 unsigned src0 = ins->src[0];
 306                 unsigned src1 = ins->src[1];
 307
 308                 if (!mir_is_written_before(ctx, bundle[0], src0))
 309                         src0 = ~0;
 310
 311                 if (!mir_is_written_before(ctx, bundle[0], src1))
 312                         src1 = ~0;
 313
 314                 if (src0 < node_count)
 315                         BITSET_SET(dependencies, src0);
 316
 317                 if (src1 < node_count)
 318                         BITSET_SET(dependencies, src1);
 319
 320                 /* Requirement 2 */
 321                 if (ins->unit == UNIT_VLUT)
 322                         goto done;
 323         }
 324
 325         /* Requirement 1 */
 326         if ((r0_written_mask & 0xF) != 0xF)
 327                 goto done;
 328
 329         /* Requirement 3 */
 330
 331         for (unsigned i = 0; i < count; ++i) {
 332                 unsigned dest = bundle[i]->dest;
 333
 334                 if (dest < node_count && BITSET_TEST(dependencies, dest))
 335                         goto done;
 336         }
 337
 338         /* Otherwise, we're good to go */
 339         success = true;
 340
 341 done:
 342         free(dependencies);
 343         return success;
 344 }
 345
 346 /* Helpers for scheudling */
 347
 348 static bool
 349 mir_is_scalar(midgard_instruction *ains)
 350 {
 351         /* Does the op support scalar units? */
 352         if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR))
 353                 return false;
 354
 355         /* Do we try to use it as a vector op? */
 356         if (!is_single_component_mask(ains->mask))
 357                 return false;
 358
 359         /* Otherwise, check mode hazards */
 360         bool could_scalar = true;
 361
 362         /* Only 16/32-bit can run on a scalar unit */
 363         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
 364         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
 365         could_scalar &= ains->alu.dest_override == midgard_dest_override_none;
 366
 367         if (ains->alu.reg_mode == midgard_reg_mode_16) {
 368                 /* If we're running in 16-bit mode, we
 369                  * can't have any 8-bit sources on the
 370                  * scalar unit (since the scalar unit
 371                  * doesn't understand 8-bit) */
 372
 373                 midgard_vector_alu_src s1 =
 374                         vector_alu_from_unsigned(ains->alu.src1);
 375
 376                 could_scalar &= !s1.half;
 377
 378                 midgard_vector_alu_src s2 =
 379                         vector_alu_from_unsigned(ains->alu.src2);
 380
 381                 could_scalar &= !s2.half;
 382         }
 383
 384         return could_scalar;
 385 }
 386
 387 /* How many bytes does this ALU instruction add to the bundle? */
 388
 389 static unsigned
 390 bytes_for_instruction(midgard_instruction *ains)
 391 {
 392         if (ains->unit & UNITS_ANY_VECTOR)
 393                 return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
 394         else if (ains->unit == ALU_ENAB_BRANCH)
 395                 return sizeof(midgard_branch_extended);
 396         else if (ains->compact_branch)
 397                 return sizeof(ains->br_compact);
 398         else
 399                 return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
 400 }
 401
 402 /* Schedules, but does not emit, a single basic block. After scheduling, the
 403  * final tag and size of the block are known, which are necessary for branching
 404  * */
 405
 406 static midgard_bundle
 407 schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
 408 {
 409         int instructions_emitted = 0, packed_idx = 0;
 410         midgard_bundle bundle = { 0 };
 411
 412         midgard_instruction *scheduled[5] = { NULL };
 413
 414         uint8_t tag = ins->type;
 415
 416         /* Default to the instruction's tag */
 417         bundle.tag = tag;
 418
 419         switch (ins->type) {
 420         case TAG_ALU_4: {
 421                 uint32_t control = 0;
 422                 size_t bytes_emitted = sizeof(control);
 423
 424                 /* TODO: Constant combining */
 425                 int index = 0, last_unit = 0;
 426
 427                 /* Previous instructions, for the purpose of parallelism */
 428                 midgard_instruction *segment[4] = {0};
 429                 int segment_size = 0;
 430
 431                 instructions_emitted = -1;
 432                 midgard_instruction *pins = ins;
 433
 434                 unsigned constant_count = 0;
 435
 436                 for (;;) {
 437                         midgard_instruction *ains = pins;
 438
 439                         /* Advance instruction pointer */
 440                         if (index) {
 441                                 ains = mir_next_op(pins);
 442                                 pins = ains;
 443                         }
 444
 445                         /* Out-of-work condition */
 446                         if ((struct list_head *) ains == &block->instructions)
 447                                 break;
 448
 449                         /* Ensure that the chain can continue */
 450                         if (ains->type != TAG_ALU_4) break;
 451
 452                         /* If there's already something in the bundle and we
 453                          * have weird scheduler constraints, break now */
 454                         if (ains->precede_break && index) break;
 455
 456                         /* According to the presentation "The ARM
 457                          * Mali-T880 Mobile GPU" from HotChips 27,
 458                          * there are two pipeline stages. Branching
 459                          * position determined experimentally. Lines
 460                          * are executed in parallel:
 461                          *
 462                          * [ VMUL ] [ SADD ]
 463                          * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
 464                          *
 465                          * Verify that there are no ordering dependencies here.
 466                          *
 467                          * TODO: Allow for parallelism!!!
 468                          */
 469
 470                         /* Pick a unit for it if it doesn't force a particular unit */
 471
 472                         int unit = ains->unit;
 473
 474                         if (!unit) {
 475                                 int op = ains->alu.op;
 476                                 int units = alu_opcode_props[op].props;
 477                                 bool scalar = mir_is_scalar(ains);
 478
 479                                 if (!scalar) {
 480                                         if (last_unit >= UNIT_VADD) {
 481                                                 if (units & UNIT_VLUT)
 482                                                         unit = UNIT_VLUT;
 483                                                 else
 484                                                         break;
 485                                         } else {
 486                                                 if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
 487                                                         unit = UNIT_VMUL;
 488                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
 489                                                         unit = UNIT_VADD;
 490                                                 else if (units & UNIT_VLUT)
 491                                                         unit = UNIT_VLUT;
 492                                                 else
 493                                                         break;
 494                                         }
 495                                 } else {
 496                                         if (last_unit >= UNIT_VADD) {
 497                                                 if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
 498                                                         unit = UNIT_SMUL;
 499                                                 else if (units & UNIT_VLUT)
 500                                                         unit = UNIT_VLUT;
 501                                                 else
 502                                                         break;
 503                                         } else {
 504                                                 if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
 505                                                         unit = UNIT_VMUL;
 506                                                 else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
 507                                                         unit = UNIT_SADD;
 508                                                 else if (units & UNIT_VADD)
 509                                                         unit = UNIT_VADD;
 510                                                 else if (units & UNIT_SMUL)
 511                                                         unit = UNIT_SMUL;
 512                                                 else if (units & UNIT_VLUT)
 513                                                         unit = UNIT_VLUT;
 514                                                 else
 515                                                         break;
 516                                         }
 517                                 }
 518
 519                                 assert(unit & units);
 520                         }
 521
 522                         /* Late unit check, this time for encoding (not parallelism) */
 523                         if (unit <= last_unit) break;
 524
 525                         /* Clear the segment */
 526                         if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
 527                                 segment_size = 0;
 528
 529                         if (midgard_has_hazard(segment, segment_size, ains))
 530                                 break;
 531
 532                         /* We're good to go -- emit the instruction */
 533                         ains->unit = unit;
 534
 535                         segment[segment_size++] = ains;
 536
 537                         /* We try to reuse constants if possible, by adjusting
 538                          * the swizzle */
 539
 540                         if (ains->has_blend_constant) {
 541                                 /* Everything conflicts with the blend constant */
 542                                 if (bundle.has_embedded_constants)
 543                                         break;
 544
 545                                 bundle.has_blend_constant = 1;
 546                                 bundle.has_embedded_constants = 1;
 547                         } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
 548                                 /* TODO: DRY with the analysis pass */
 549
 550                                 if (bundle.has_blend_constant)
 551                                         break;
 552
 553                                 if (constant_count)
 554                                         break;
 555
 556                                 /* TODO: Fix packing XXX */
 557                                 uint16_t *bundles = (uint16_t *) bundle.constants;
 558                                 uint32_t *constants = (uint32_t *) ains->constants;
 559
 560                                 /* Copy them wholesale */
 561                                 for (unsigned i = 0; i < 4; ++i)
 562                                         bundles[i] = constants[i];
 563
 564                                 bundle.has_embedded_constants = true;
 565                                 constant_count = 4;
 566                         } else if (ains->has_constants) {
 567                                 /* By definition, blend constants conflict with
 568                                  * everything, so if there are already
 569                                  * constants we break the bundle *now* */
 570
 571                                 if (bundle.has_blend_constant)
 572                                         break;
 573
 574                                 /* For anything but blend constants, we can do
 575                                  * proper analysis, however */
 576
 577                                 /* TODO: Mask by which are used */
 578                                 uint32_t *constants = (uint32_t *) ains->constants;
 579                                 uint32_t *bundles = (uint32_t *) bundle.constants;
 580
 581                                 uint32_t indices[4] = { 0 };
 582                                 bool break_bundle = false;
 583
 584                                 for (unsigned i = 0; i < 4; ++i) {
 585                                         uint32_t cons = constants[i];
 586                                         bool constant_found = false;
 587
 588                                         /* Search for the constant */
 589                                         for (unsigned j = 0; j < constant_count; ++j) {
 590                                                 if (bundles[j] != cons)
 591                                                         continue;
 592
 593                                                 /* We found it, reuse */
 594                                                 indices[i] = j;
 595                                                 constant_found = true;
 596                                                 break;
 597                                         }
 598
 599                                         if (constant_found)
 600                                                 continue;
 601
 602                                         /* We didn't find it, so allocate it */
 603                                         unsigned idx = constant_count++;
 604
 605                                         if (idx >= 4) {
 606                                                 /* Uh-oh, out of space */
 607                                                 break_bundle = true;
 608                                                 break;
 609                                         }
 610
 611                                         /* We have space, copy it in! */
 612                                         bundles[idx] = cons;
 613                                         indices[i] = idx;
 614                                 }
 615
 616                                 if (break_bundle)
 617                                         break;
 618
 619                                 /* Cool, we have it in. So use indices as a
 620                                  * swizzle */
 621
 622                                 unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
 623                                 unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
 624
 625                                 if (ains->src[0] == r_constant)
 626                                         ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
 627
 628                                 if (ains->src[1] == r_constant)
 629                                         ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
 630
 631                                 bundle.has_embedded_constants = true;
 632                         }
 633
 634                         if (ains->compact_branch) {
 635                                 /* All of r0 has to be written out along with
 636                                  * the branch writeout */
 637
 638                                 if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count, ains->src[0])) {
 639                                         /* We only work on full moves
 640                                          * at the beginning. We could
 641                                          * probably do better */
 642                                         if (index != 0)
 643                                                 break;
 644
 645                                         /* Inject a move */
 646                                         midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
 647                                         ins.unit = UNIT_VMUL;
 648                                         control |= ins.unit;
 649
 650                                         /* TODO don't leak */
 651                                         midgard_instruction *move =
 652                                                 mem_dup(&ins, sizeof(midgard_instruction));
 653                                         bytes_emitted += bytes_for_instruction(move);
 654                                         bundle.instructions[packed_idx++] = move;
 655                                 }
 656                         }
 657
 658                         bytes_emitted += bytes_for_instruction(ains);
 659
 660                         /* Defer marking until after writing to allow for break */
 661                         scheduled[index] = ains;
 662                         control |= ains->unit;
 663                         last_unit = ains->unit;
 664                         ++instructions_emitted;
 665                         ++index;
 666                 }
 667
 668                 int padding = 0;
 669
 670                 /* Pad ALU op to nearest word */
 671
 672                 if (bytes_emitted & 15) {
 673                         padding = 16 - (bytes_emitted & 15);
 674                         bytes_emitted += padding;
 675                 }
 676
 677                 /* Constants must always be quadwords */
 678                 if (bundle.has_embedded_constants)
 679                         bytes_emitted += 16;
 680
 681                 /* Size ALU instruction for tag */
 682                 bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
 683                 bundle.padding = padding;
 684                 bundle.control = bundle.tag | control;
 685
 686                 break;
 687         }
 688
 689         case TAG_LOAD_STORE_4: {
 690                 /* Load store instructions have two words at once. If
 691                  * we only have one queued up, we need to NOP pad.
 692                  * Otherwise, we store both in succession to save space
 693                  * and cycles -- letting them go in parallel -- skip
 694                  * the next. The usefulness of this optimisation is
 695                  * greatly dependent on the quality of the instruction
 696                  * scheduler.
 697                  */
 698
 699                 midgard_instruction *next_op = mir_next_op(ins);
 700
 701                 if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
 702                         /* TODO: Concurrency check */
 703                         instructions_emitted++;
 704                 }
 705
 706                 break;
 707         }
 708
 709         case TAG_TEXTURE_4: {
 710                 /* Which tag we use depends on the shader stage */
 711                 bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
 712                 bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
 713                 break;
 714         }
 715
 716         default:
 717                 unreachable("Unknown tag");
 718                 break;
 719         }
 720
 721         /* Copy the instructions into the bundle */
 722         bundle.instruction_count = instructions_emitted + 1 + packed_idx;
 723
 724         midgard_instruction *uins = ins;
 725         for (; packed_idx < bundle.instruction_count; ++packed_idx) {
 726                 assert(&uins->link != &block->instructions);
 727                 bundle.instructions[packed_idx] = uins;
 728                 uins = mir_next_op(uins);
 729         }
 730
 731         *skip = instructions_emitted;
 732
 733         return bundle;
 734 }
 735
 736 /* We would like to flatten the linked list of midgard_instructions in a bundle
 737  * to an array of pointers on the heap for easy indexing */
 738
 739 static midgard_instruction **
 740 flatten_mir(midgard_block *block, unsigned *len)
 741 {
 742         *len = list_length(&block->instructions);
 743
 744         if (!(*len))
 745                 return NULL;
 746
 747         midgard_instruction **instructions =
 748                 calloc(sizeof(midgard_instruction *), *len);
 749
 750         unsigned i = 0;
 751
 752         mir_foreach_instr_in_block(block, ins)
 753                 instructions[i++] = ins;
 754
 755         return instructions;
 756 }
 757
 758 /* The worklist is the set of instructions that can be scheduled now; that is,
 759  * the set of instructions with no remaining dependencies */
 760
 761 static void
 762 mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count)
 763 {
 764         for (unsigned i = 0; i < count; ++i) {
 765                 if (instructions[i]->nr_dependencies == 0)
 766                         BITSET_SET(worklist, i);
 767         }
 768 }
 769
 770 /* While scheduling, we need to choose instructions satisfying certain
 771  * criteria. As we schedule backwards, we choose the *last* instruction in the
 772  * worklist to simulate in-order scheduling. Chosen instructions must satisfy a
 773  * given predicate. */
 774
 775 struct midgard_predicate {
 776         /* TAG or ~0 for dont-care */
 777         unsigned tag;
 778
 779         /* True if we want to pop off the chosen instruction */
 780         bool destructive;
 781 };
 782
 783 static midgard_instruction *
 784 mir_choose_instruction(
 785                 midgard_instruction **instructions,
 786                 BITSET_WORD *worklist, unsigned count,
 787                 struct midgard_predicate *predicate)
 788 {
 789         /* Parse the predicate */
 790         unsigned tag = predicate->tag;
 791
 792         /* Iterate to find the best instruction satisfying the predicate */
 793         unsigned i;
 794         BITSET_WORD tmp;
 795
 796         signed best_index = -1;
 797
 798         BITSET_FOREACH_SET(i, tmp, worklist, count) {
 799                 if (tag != ~0 && instructions[i]->type != tag)
 800                         continue;
 801
 802                 /* Simulate in-order scheduling */
 803                 if ((signed) i < best_index)
 804                         continue;
 805
 806                 best_index = i;
 807         }
 808
 809
 810         /* Did we find anything?  */
 811
 812         if (best_index < 0)
 813                 return NULL;
 814
 815         /* If we found something, remove it from the worklist */
 816         assert(best_index < count);
 817
 818         if (predicate->destructive) {
 819                 BITSET_CLEAR(worklist, best_index);
 820         }
 821
 822         return instructions[best_index];
 823 }
 824
 825 /* Schedule a single block by iterating its instruction to create bundles.
 826  * While we go, tally about the bundle sizes to compute the block size. */
 827
 828 static void
 829 schedule_block(compiler_context *ctx, midgard_block *block)
 830 {
 831         /* Copy list to dynamic array */
 832         unsigned len = 0;
 833         midgard_instruction **instructions = flatten_mir(block, &len);
 834
 835         /* Calculate dependencies and initial worklist */
 836         unsigned node_count = ctx->temp_count + 1;
 837         mir_create_dependency_graph(instructions, len, node_count);
 838
 839         /* Allocate the worklist */
 840         size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD);
 841         BITSET_WORD *worklist = calloc(sz, 1);
 842         mir_initialize_worklist(worklist, instructions, len);
 843
 844         util_dynarray_init(&block->bundles, NULL);
 845
 846         block->quadword_count = 0;
 847
 848         int skip = 0;
 849         mir_foreach_instr_in_block(block, ins) {
 850                 if (skip) {
 851                         skip--;
 852                         continue;
 853                 }
 854
 855                 midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
 856                 util_dynarray_append(&block->bundles, midgard_bundle, bundle);
 857
 858                 if (bundle.has_blend_constant) {
 859                         unsigned offset = ctx->quadword_count + block->quadword_count + quadword_size(bundle.tag) - 1;
 860                         ctx->blend_constant_offset = offset * 0x10;
 861                 }
 862
 863                 block->quadword_count += quadword_size(bundle.tag);
 864         }
 865
 866         block->is_scheduled = true;
 867         ctx->quadword_count += block->quadword_count;
 868 }
 869
 870 /* The following passes reorder MIR instructions to enable better scheduling */
 871
 872 static void
 873 midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
 874 {
 875         mir_foreach_instr_in_block_safe(block, ins) {
 876                 if (ins->type != TAG_LOAD_STORE_4) continue;
 877
 878                 /* We've found a load/store op. Check if next is also load/store. */
 879                 midgard_instruction *next_op = mir_next_op(ins);
 880                 if (&next_op->link != &block->instructions) {
 881                         if (next_op->type == TAG_LOAD_STORE_4) {
 882                                 /* If so, we're done since we're a pair */
 883                                 ins = mir_next_op(ins);
 884                                 continue;
 885                         }
 886
 887                         /* Maximum search distance to pair, to avoid register pressure disasters */
 888                         int search_distance = 8;
 889
 890                         /* Otherwise, we have an orphaned load/store -- search for another load */
 891                         mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
 892                                 /* Terminate search if necessary */
 893                                 if (!(search_distance--)) break;
 894
 895                                 if (c->type != TAG_LOAD_STORE_4) continue;
 896
 897                                 /* We can only reorder if there are no sources */
 898
 899                                 bool deps = false;
 900
 901                                 for (unsigned s = 0; s < ARRAY_SIZE(ins->src); ++s)
 902                                         deps |= (c->src[s] != ~0);
 903
 904                                 if (deps)
 905                                         continue;
 906
 907                                 /* We found one! Move it up to pair and remove it from the old location */
 908
 909                                 mir_insert_instruction_before(ctx, ins, *c);
 910                                 mir_remove_instruction(c);
 911
 912                                 break;
 913                         }
 914                 }
 915         }
 916 }
 917
 918 /* When we're 'squeezing down' the values in the IR, we maintain a hash
 919  * as such */
 920
 921 static unsigned
 922 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
 923 {
 924         if (hash >= SSA_FIXED_MINIMUM)
 925                 return hash;
 926
 927         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
 928                                 ctx->hash_to_temp, hash + 1);
 929
 930         if (temp)
 931                 return temp - 1;
 932
 933         /* If no temp is find, allocate one */
 934         temp = ctx->temp_count++;
 935         ctx->max_hash = MAX2(ctx->max_hash, hash);
 936
 937         _mesa_hash_table_u64_insert(ctx->hash_to_temp,
 938                                     hash + 1, (void *) ((uintptr_t) temp + 1));
 939
 940         return temp;
 941 }
 942
 943 /* Reassigns numbering to get rid of gaps in the indices */
 944
 945 static void
 946 mir_squeeze_index(compiler_context *ctx)
 947 {
 948         /* Reset */
 949         ctx->temp_count = 0;
 950         /* TODO don't leak old hash_to_temp */
 951         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
 952
 953         mir_foreach_instr_global(ctx, ins) {
 954                 ins->dest = find_or_allocate_temp(ctx, ins->dest);
 955
 956                 for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
 957                         ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
 958         }
 959 }
 960
 961 static midgard_instruction
 962 v_load_store_scratch(
 963                 unsigned srcdest,
 964                 unsigned index,
 965                 bool is_store,
 966                 unsigned mask)
 967 {
 968         /* We index by 32-bit vec4s */
 969         unsigned byte = (index * 4 * 4);
 970
 971         midgard_instruction ins = {
 972                 .type = TAG_LOAD_STORE_4,
 973                 .mask = mask,
 974                 .dest = ~0,
 975                 .src = { ~0, ~0, ~0 },
 976                 .load_store = {
 977                         .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
 978                         .swizzle = SWIZZLE_XYZW,
 979
 980                         /* For register spilling - to thread local storage */
 981                         .arg_1 = 0xEA,
 982                         .arg_2 = 0x1E,
 983
 984                         /* Splattered across, TODO combine logically */
 985                         .varying_parameters = (byte & 0x1FF) << 1,
 986                         .address = (byte >> 9)
 987                 },
 988
 989                 /* If we spill an unspill, RA goes into an infinite loop */
 990                 .no_spill = true
 991         };
 992
 993        if (is_store) {
 994                 /* r0 = r26, r1 = r27 */
 995                 assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
 996                 ins.src[0] = srcdest;
 997         } else {
 998                 ins.dest = srcdest;
 999         }
1000
1001         return ins;
1002 }
1003
1004 /* If register allocation fails, find the best spill node and spill it to fix
1005  * whatever the issue was. This spill node could be a work register (spilling
1006  * to thread local storage), but it could also simply be a special register
1007  * that needs to spill to become a work register. */
1008
1009 static void mir_spill_register(
1010                 compiler_context *ctx,
1011                 struct ra_graph *g,
1012                 unsigned *spill_count)
1013 {
1014         unsigned spill_index = ctx->temp_count;
1015
1016         /* Our first step is to calculate spill cost to figure out the best
1017          * spill node. All nodes are equal in spill cost, but we can't spill
1018          * nodes written to from an unspill */
1019
1020         for (unsigned i = 0; i < ctx->temp_count; ++i) {
1021                 ra_set_node_spill_cost(g, i, 1.0);
1022         }
1023
1024         /* We can't spill any bundles that contain unspills. This could be
1025          * optimized to allow use of r27 to spill twice per bundle, but if
1026          * you're at the point of optimizing spilling, it's too late. */
1027
1028         mir_foreach_block(ctx, block) {
1029                 mir_foreach_bundle_in_block(block, bun) {
1030                         bool no_spill = false;
1031
1032                         for (unsigned i = 0; i < bun->instruction_count; ++i)
1033                                 no_spill |= bun->instructions[i]->no_spill;
1034
1035                         if (!no_spill)
1036                                 continue;
1037
1038                         for (unsigned i = 0; i < bun->instruction_count; ++i) {
1039                                 unsigned dest = bun->instructions[i]->dest;
1040                                 if (dest < ctx->temp_count)
1041                                         ra_set_node_spill_cost(g, dest, -1.0);
1042                         }
1043                 }
1044         }
1045
1046         int spill_node = ra_get_best_spill_node(g);
1047
1048         if (spill_node < 0) {
1049                 mir_print_shader(ctx);
1050                 assert(0);
1051         }
1052
1053         /* We have a spill node, so check the class. Work registers
1054          * legitimately spill to TLS, but special registers just spill to work
1055          * registers */
1056
1057         unsigned class = ra_get_node_class(g, spill_node);
1058         bool is_special = (class >> 2) != REG_CLASS_WORK;
1059         bool is_special_w = (class >> 2) == REG_CLASS_TEXW;
1060
1061         /* Allocate TLS slot (maybe) */
1062         unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
1063
1064         /* For TLS, replace all stores to the spilled node. For
1065          * special reads, just keep as-is; the class will be demoted
1066          * implicitly. For special writes, spill to a work register */
1067
1068         if (!is_special || is_special_w) {
1069                 if (is_special_w)
1070                         spill_slot = spill_index++;
1071
1072                 mir_foreach_block(ctx, block) {
1073                 mir_foreach_instr_in_block_safe(block, ins) {
1074                         if (ins->dest != spill_node) continue;
1075
1076                         midgard_instruction st;
1077
1078                         if (is_special_w) {
1079                                 st = v_mov(spill_node, blank_alu_src, spill_slot);
1080                                 st.no_spill = true;
1081                         } else {
1082                                 ins->dest = SSA_FIXED_REGISTER(26);
1083                                 ins->no_spill = true;
1084                                 st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
1085                         }
1086
1087                         /* Hint: don't rewrite this node */
1088                         st.hint = true;
1089
1090                         mir_insert_instruction_after_scheduled(ctx, block, ins, st);
1091
1092                         if (!is_special)
1093                                 ctx->spills++;
1094                 }
1095                 }
1096         }
1097
1098         /* For special reads, figure out how many components we need */
1099         unsigned read_mask = 0;
1100
1101         mir_foreach_instr_global_safe(ctx, ins) {
1102                 read_mask |= mir_mask_of_read_components(ins, spill_node);
1103         }
1104
1105         /* Insert a load from TLS before the first consecutive
1106          * use of the node, rewriting to use spilled indices to
1107          * break up the live range. Or, for special, insert a
1108          * move. Ironically the latter *increases* register
1109          * pressure, but the two uses of the spilling mechanism
1110          * are somewhat orthogonal. (special spilling is to use
1111          * work registers to back special registers; TLS
1112          * spilling is to use memory to back work registers) */
1113
1114         mir_foreach_block(ctx, block) {
1115                 bool consecutive_skip = false;
1116                 unsigned consecutive_index = 0;
1117
1118                 mir_foreach_instr_in_block(block, ins) {
1119                         /* We can't rewrite the moves used to spill in the
1120                          * first place. These moves are hinted. */
1121                         if (ins->hint) continue;
1122
1123                         if (!mir_has_arg(ins, spill_node)) {
1124                                 consecutive_skip = false;
1125                                 continue;
1126                         }
1127
1128                         if (consecutive_skip) {
1129                                 /* Rewrite */
1130                                 mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
1131                                 continue;
1132                         }
1133
1134                         if (!is_special_w) {
1135                                 consecutive_index = ++spill_index;
1136
1137                                 midgard_instruction *before = ins;
1138
1139                                 /* For a csel, go back one more not to break up the bundle */
1140                                 if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
1141                                         before = mir_prev_op(before);
1142
1143                                 midgard_instruction st;
1144
1145                                 if (is_special) {
1146                                         /* Move */
1147                                         st = v_mov(spill_node, blank_alu_src, consecutive_index);
1148                                         st.no_spill = true;
1149                                 } else {
1150                                         /* TLS load */
1151                                         st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
1152                                 }
1153
1154                                 /* Mask the load based on the component count
1155                                  * actually needed to prvent RA loops */
1156
1157                                 st.mask = read_mask;
1158
1159                                 mir_insert_instruction_before_scheduled(ctx, block, before, st);
1160                                // consecutive_skip = true;
1161                         } else {
1162                                 /* Special writes already have their move spilled in */
1163                                 consecutive_index = spill_slot;
1164                         }
1165
1166
1167                         /* Rewrite to use */
1168                         mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
1169
1170                         if (!is_special)
1171                                 ctx->fills++;
1172                 }
1173         }
1174
1175         /* Reset hints */
1176
1177         mir_foreach_instr_global(ctx, ins) {
1178                 ins->hint = false;
1179         }
1180 }
1181
1182 void
1183 schedule_program(compiler_context *ctx)
1184 {
1185         struct ra_graph *g = NULL;
1186         bool spilled = false;
1187         int iter_count = 1000; /* max iterations */
1188
1189         /* Number of 128-bit slots in memory we've spilled into */
1190         unsigned spill_count = 0;
1191
1192         midgard_promote_uniforms(ctx, 16);
1193
1194         mir_foreach_block(ctx, block) {
1195                 midgard_pair_load_store(ctx, block);
1196         }
1197
1198         /* Must be lowered right before RA */
1199         mir_squeeze_index(ctx);
1200         mir_lower_special_reads(ctx);
1201         mir_squeeze_index(ctx);
1202
1203         /* Lowering can introduce some dead moves */
1204
1205         mir_foreach_block(ctx, block) {
1206                 midgard_opt_dead_move_eliminate(ctx, block);
1207                 schedule_block(ctx, block);
1208         }
1209
1210         mir_create_pipeline_registers(ctx);
1211
1212         do {
1213                 if (spilled)
1214                         mir_spill_register(ctx, g, &spill_count);
1215
1216                 mir_squeeze_index(ctx);
1217
1218                 g = NULL;
1219                 g = allocate_registers(ctx, &spilled);
1220         } while(spilled && ((iter_count--) > 0));
1221
1222         if (iter_count <= 0) {
1223                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
1224                 assert(0);
1225         }
1226
1227         /* Report spilling information. spill_count is in 128-bit slots (vec4 x
1228          * fp32), but tls_size is in bytes, so multiply by 16 */
1229
1230         ctx->tls_size = spill_count * 16;
1231
1232         install_registers(ctx, g);
1233 }