src/panfrost/midgard/midgard_schedule.c

   1 /*
   2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23
  24 #include "compiler.h"
  25 #include "midgard_ops.h"
  26 #include "util/u_memory.h"
  27 #include "util/register_allocate.h"
  28
  29 /* Scheduling for Midgard is complicated, to say the least. ALU instructions
  30  * must be grouped into VLIW bundles according to following model:
  31  *
  32  * [VMUL] [SADD]
  33  * [VADD] [SMUL] [VLUT]
  34  *
  35  * A given instruction can execute on some subset of the units (or a few can
  36  * execute on all). Instructions can be either vector or scalar; only scalar
  37  * instructions can execute on SADD/SMUL units. Units on a given line execute
  38  * in parallel. Subsequent lines execute separately and can pass results
  39  * directly via pipeline registers r24/r25, bypassing the register file.
  40  *
  41  * A bundle can optionally have 128-bits of embedded constants, shared across
  42  * all of the instructions within a bundle.
  43  *
  44  * Instructions consuming conditionals (branches and conditional selects)
  45  * require their condition to be written into the conditional register (r31)
  46  * within the same bundle they are consumed.
  47  *
  48  * Fragment writeout requires its argument to be written in full within the
  49  * same bundle as the branch, with no hanging dependencies.
  50  *
  51  * Load/store instructions are also in bundles of simply two instructions, and
  52  * texture instructions have no bundling.
  53  *
  54  * -------------------------------------------------------------------------
  55  *
  56  */
  57
  58 /* We create the dependency graph with per-component granularity */
  59
  60 #define COMPONENT_COUNT 8
  61
  62 static void
  63 add_dependency(struct util_dynarray *table, unsigned index, unsigned mask, midgard_instruction **instructions, unsigned child)
  64 {
  65         for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
  66                 if (!(mask & (1 << i)))
  67                         continue;
  68
  69                 struct util_dynarray *parents = &table[(COMPONENT_COUNT * index) + i];
  70
  71                 util_dynarray_foreach(parents, unsigned, parent) {
  72                         BITSET_WORD *dependents = instructions[*parent]->dependents;
  73
  74                         /* Already have the dependency */
  75                         if (BITSET_TEST(dependents, child))
  76                                 continue;
  77
  78                         BITSET_SET(dependents, child);
  79                         instructions[child]->nr_dependencies++;
  80                 }
  81         }
  82 }
  83
  84 static void
  85 mark_access(struct util_dynarray *table, unsigned index, unsigned mask, unsigned parent)
  86 {
  87         for (unsigned i = 0; i < COMPONENT_COUNT; ++i) {
  88                 if (!(mask & (1 << i)))
  89                         continue;
  90
  91                 util_dynarray_append(&table[(COMPONENT_COUNT * index) + i], unsigned, parent);
  92         }
  93 }
  94
  95 static void
  96 mir_create_dependency_graph(midgard_instruction **instructions, unsigned count, unsigned node_count)
  97 {
  98         size_t sz = node_count * COMPONENT_COUNT;
  99
 100         struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz);
 101         struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz);
 102
 103         for (unsigned i = 0; i < sz; ++i) {
 104                 util_dynarray_init(&last_read[i], NULL);
 105                 util_dynarray_init(&last_write[i], NULL);
 106         }
 107
 108         /* Initialize dependency graph */
 109         for (unsigned i = 0; i < count; ++i) {
 110                 instructions[i]->dependents =
 111                         calloc(BITSET_WORDS(count), sizeof(BITSET_WORD));
 112
 113                 instructions[i]->nr_dependencies = 0;
 114         }
 115
 116         /* Populate dependency graph */
 117         for (signed i = count - 1; i >= 0; --i) {
 118                 if (instructions[i]->compact_branch)
 119                         continue;
 120
 121                 unsigned dest = instructions[i]->dest;
 122                 unsigned mask = instructions[i]->mask;
 123
 124                 mir_foreach_src((*instructions), s) {
 125                         unsigned src = instructions[i]->src[s];
 126
 127                         if (src < node_count) {
 128                                 unsigned readmask = mir_mask_of_read_components(instructions[i], src);
 129                                 add_dependency(last_write, src, readmask, instructions, i);
 130                         }
 131                 }
 132
 133                 if (dest < node_count) {
 134                         add_dependency(last_read, dest, mask, instructions, i);
 135                         add_dependency(last_write, dest, mask, instructions, i);
 136                         mark_access(last_write, dest, mask, i);
 137                 }
 138
 139                 mir_foreach_src((*instructions), s) {
 140                         unsigned src = instructions[i]->src[s];
 141
 142                         if (src < node_count) {
 143                                 unsigned readmask = mir_mask_of_read_components(instructions[i], src);
 144                                 mark_access(last_read, src, readmask, i);
 145                         }
 146                 }
 147         }
 148
 149         /* If there is a branch, all instructions depend on it, as interblock
 150          * execution must be purely in-order */
 151
 152         if (instructions[count - 1]->compact_branch) {
 153                 BITSET_WORD *dependents = instructions[count - 1]->dependents;
 154
 155                 for (signed i = count - 2; i >= 0; --i) {
 156                         if (BITSET_TEST(dependents, i))
 157                                 continue;
 158
 159                         BITSET_SET(dependents, i);
 160                         instructions[i]->nr_dependencies++;
 161                 }
 162         }
 163
 164         /* Free the intermediate structures */
 165         for (unsigned i = 0; i < sz; ++i) {
 166                 util_dynarray_fini(&last_read[i]);
 167                 util_dynarray_fini(&last_write[i]);
 168         }
 169 }
 170
 171 /* Create a mask of accessed components from a swizzle to figure out vector
 172  * dependencies */
 173
 174 static unsigned
 175 swizzle_to_access_mask(unsigned swizzle)
 176 {
 177         unsigned component_mask = 0;
 178
 179         for (int i = 0; i < 4; ++i) {
 180                 unsigned c = (swizzle >> (2 * i)) & 3;
 181                 component_mask |= (1 << c);
 182         }
 183
 184         return component_mask;
 185 }
 186
 187 /* Does the mask cover more than a scalar? */
 188
 189 static bool
 190 is_single_component_mask(unsigned mask)
 191 {
 192         int components = 0;
 193
 194         for (int c = 0; c < 8; ++c) {
 195                 if (mask & (1 << c))
 196                         components++;
 197         }
 198
 199         return components == 1;
 200 }
 201
 202 /* Checks for an SSA data hazard between two adjacent instructions, keeping in
 203  * mind that we are a vector architecture and we can write to different
 204  * components simultaneously */
 205
 206 static bool
 207 can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
 208 {
 209         /* Writeout has its own rules anyway */
 210         if (first->compact_branch || second->compact_branch)
 211                 return true;
 212
 213         /* Each instruction reads some registers and writes to a register. See
 214          * where the first writes */
 215
 216         int source = first->dest;
 217         int source_mask = first->mask;
 218
 219         /* As long as the second doesn't read from the first, we're okay */
 220         for (unsigned i = 0; i < ARRAY_SIZE(second->src); ++i) {
 221                 if (second->src[i] != source)
 222                         continue;
 223
 224                 if (first->type != TAG_ALU_4)
 225                         return false;
 226
 227                 /* Figure out which components we just read from */
 228
 229                 int q = (i == 0) ? second->alu.src1 : second->alu.src2;
 230                 midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
 231
 232                 /* Check if there are components in common, and fail if so */
 233                 if (swizzle_to_access_mask(m->swizzle) & source_mask)
 234                         return false;
 235         }
 236
 237         /* Otherwise, it's safe in that regard. Another data hazard is both
 238          * writing to the same place, of course */
 239
 240         if (second->dest == source) {
 241                 /* ...but only if the components overlap */
 242
 243                 if (second->mask & source_mask)
 244                         return false;
 245         }
 246
 247         /* ...That's it */
 248         return true;
 249 }
 250
 251 static bool
 252 midgard_has_hazard(
 253         midgard_instruction **segment, unsigned segment_size,
 254         midgard_instruction *ains)
 255 {
 256         for (int s = 0; s < segment_size; ++s)
 257                 if (!can_run_concurrent_ssa(segment[s], ains))
 258                         return true;
 259
 260         return false;
 261
 262
 263 }
 264
 265 /* Fragment writeout (of r0) is allowed when:
 266  *
 267  *  - All components of r0 are written in the bundle
 268  *  - No components of r0 are written in VLUT
 269  *  - Non-pipelined dependencies of r0 are not written in the bundle
 270  *
 271  * This function checks if these requirements are satisfied given the content
 272  * of a scheduled bundle.
 273  */
 274
 275 static bool
 276 can_writeout_fragment(compiler_context *ctx, midgard_instruction **bundle, unsigned count, unsigned node_count, unsigned r0)
 277 {
 278         /* First scan for which components of r0 are written out. Initially
 279          * none are written */
 280
 281         uint8_t r0_written_mask = 0x0;
 282
 283         /* Simultaneously we scan for the set of dependencies */
 284
 285         size_t sz = sizeof(BITSET_WORD) * BITSET_WORDS(node_count);
 286         BITSET_WORD *dependencies = calloc(1, sz);
 287         memset(dependencies, 0, sz);
 288
 289         bool success = false;
 290
 291         for (unsigned i = 0; i < count; ++i) {
 292                 midgard_instruction *ins = bundle[i];
 293
 294                 if (ins->dest != r0)
 295                         continue;
 296
 297                 /* Record written out mask */
 298                 r0_written_mask |= ins->mask;
 299
 300                 /* Record dependencies, but only if they won't become pipeline
 301                  * registers. We know we can't be live after this, because
 302                  * we're writeout at the very end of the shader. So check if
 303                  * they were written before us. */
 304
 305                 unsigned src0 = ins->src[0];
 306                 unsigned src1 = ins->src[1];
 307
 308                 if (!mir_is_written_before(ctx, bundle[0], src0))
 309                         src0 = ~0;
 310
 311                 if (!mir_is_written_before(ctx, bundle[0], src1))
 312                         src1 = ~0;
 313
 314                 if (src0 < node_count)
 315                         BITSET_SET(dependencies, src0);
 316
 317                 if (src1 < node_count)
 318                         BITSET_SET(dependencies, src1);
 319
 320                 /* Requirement 2 */
 321                 if (ins->unit == UNIT_VLUT)
 322                         goto done;
 323         }
 324
 325         /* Requirement 1 */
 326         if ((r0_written_mask & 0xF) != 0xF)
 327                 goto done;
 328
 329         /* Requirement 3 */
 330
 331         for (unsigned i = 0; i < count; ++i) {
 332                 unsigned dest = bundle[i]->dest;
 333
 334                 if (dest < node_count && BITSET_TEST(dependencies, dest))
 335                         goto done;
 336         }
 337
 338         /* Otherwise, we're good to go */
 339         success = true;
 340
 341 done:
 342         free(dependencies);
 343         return success;
 344 }
 345
 346 /* Helpers for scheudling */
 347
 348 static bool
 349 mir_is_scalar(midgard_instruction *ains)
 350 {
 351         /* Does the op support scalar units? */
 352         if (!(alu_opcode_props[ains->alu.op].props & UNITS_SCALAR))
 353                 return false;
 354
 355         /* Do we try to use it as a vector op? */
 356         if (!is_single_component_mask(ains->mask))
 357                 return false;
 358
 359         /* Otherwise, check mode hazards */
 360         bool could_scalar = true;
 361
 362         /* Only 16/32-bit can run on a scalar unit */
 363         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_8;
 364         could_scalar &= ains->alu.reg_mode != midgard_reg_mode_64;
 365         could_scalar &= ains->alu.dest_override == midgard_dest_override_none;
 366
 367         if (ains->alu.reg_mode == midgard_reg_mode_16) {
 368                 /* If we're running in 16-bit mode, we
 369                  * can't have any 8-bit sources on the
 370                  * scalar unit (since the scalar unit
 371                  * doesn't understand 8-bit) */
 372
 373                 midgard_vector_alu_src s1 =
 374                         vector_alu_from_unsigned(ains->alu.src1);
 375
 376                 could_scalar &= !s1.half;
 377
 378                 midgard_vector_alu_src s2 =
 379                         vector_alu_from_unsigned(ains->alu.src2);
 380
 381                 could_scalar &= !s2.half;
 382         }
 383
 384         return could_scalar;
 385 }
 386
 387 /* How many bytes does this ALU instruction add to the bundle? */
 388
 389 static unsigned
 390 bytes_for_instruction(midgard_instruction *ains)
 391 {
 392         if (ains->unit & UNITS_ANY_VECTOR)
 393                 return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
 394         else if (ains->unit == ALU_ENAB_BRANCH)
 395                 return sizeof(midgard_branch_extended);
 396         else if (ains->compact_branch)
 397                 return sizeof(ains->br_compact);
 398         else
 399                 return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
 400 }
 401
 402 /* Schedules, but does not emit, a single basic block. After scheduling, the
 403  * final tag and size of the block are known, which are necessary for branching
 404  * */
 405
 406 static midgard_bundle
 407 schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
 408 {
 409         int instructions_emitted = 0, packed_idx = 0;
 410         midgard_bundle bundle = { 0 };
 411
 412         midgard_instruction *scheduled[5] = { NULL };
 413
 414         uint8_t tag = ins->type;
 415
 416         /* Default to the instruction's tag */
 417         bundle.tag = tag;
 418
 419         switch (ins->type) {
 420         case TAG_ALU_4: {
 421                 uint32_t control = 0;
 422                 size_t bytes_emitted = sizeof(control);
 423
 424                 /* TODO: Constant combining */
 425                 int index = 0, last_unit = 0;
 426
 427                 /* Previous instructions, for the purpose of parallelism */
 428                 midgard_instruction *segment[4] = {0};
 429                 int segment_size = 0;
 430
 431                 instructions_emitted = -1;
 432                 midgard_instruction *pins = ins;
 433
 434                 unsigned constant_count = 0;
 435
 436                 for (;;) {
 437                         midgard_instruction *ains = pins;
 438
 439                         /* Advance instruction pointer */
 440                         if (index) {
 441                                 ains = mir_next_op(pins);
 442                                 pins = ains;
 443                         }
 444
 445                         /* Out-of-work condition */
 446                         if ((struct list_head *) ains == &block->instructions)
 447                                 break;
 448
 449                         /* Ensure that the chain can continue */
 450                         if (ains->type != TAG_ALU_4) break;
 451
 452                         /* If there's already something in the bundle and we
 453                          * have weird scheduler constraints, break now */
 454                         if (ains->precede_break && index) break;
 455
 456                         /* According to the presentation "The ARM
 457                          * Mali-T880 Mobile GPU" from HotChips 27,
 458                          * there are two pipeline stages. Branching
 459                          * position determined experimentally. Lines
 460                          * are executed in parallel:
 461                          *
 462                          * [ VMUL ] [ SADD ]
 463                          * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
 464                          *
 465                          * Verify that there are no ordering dependencies here.
 466                          *
 467                          * TODO: Allow for parallelism!!!
 468                          */
 469
 470                         /* Pick a unit for it if it doesn't force a particular unit */
 471
 472                         int unit = ains->unit;
 473
 474                         if (!unit) {
 475                                 int op = ains->alu.op;
 476                                 int units = alu_opcode_props[op].props;
 477                                 bool scalar = mir_is_scalar(ains);
 478
 479                                 if (!scalar) {
 480                                         if (last_unit >= UNIT_VADD) {
 481                                                 if (units & UNIT_VLUT)
 482                                                         unit = UNIT_VLUT;
 483                                                 else
 484                                                         break;
 485                                         } else {
 486                                                 if ((units & UNIT_VMUL) && last_unit < UNIT_VMUL)
 487                                                         unit = UNIT_VMUL;
 488                                                 else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
 489                                                         unit = UNIT_VADD;
 490                                                 else if (units & UNIT_VLUT)
 491                                                         unit = UNIT_VLUT;
 492                                                 else
 493                                                         break;
 494                                         }
 495                                 } else {
 496                                         if (last_unit >= UNIT_VADD) {
 497                                                 if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
 498                                                         unit = UNIT_SMUL;
 499                                                 else if (units & UNIT_VLUT)
 500                                                         unit = UNIT_VLUT;
 501                                                 else
 502                                                         break;
 503                                         } else {
 504                                                 if ((units & UNIT_VMUL) && (last_unit < UNIT_VMUL))
 505                                                         unit = UNIT_VMUL;
 506                                                 else if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains))
 507                                                         unit = UNIT_SADD;
 508                                                 else if (units & UNIT_VADD)
 509                                                         unit = UNIT_VADD;
 510                                                 else if (units & UNIT_SMUL)
 511                                                         unit = UNIT_SMUL;
 512                                                 else if (units & UNIT_VLUT)
 513                                                         unit = UNIT_VLUT;
 514                                                 else
 515                                                         break;
 516                                         }
 517                                 }
 518
 519                                 assert(unit & units);
 520                         }
 521
 522                         /* Late unit check, this time for encoding (not parallelism) */
 523                         if (unit <= last_unit) break;
 524
 525                         /* Clear the segment */
 526                         if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
 527                                 segment_size = 0;
 528
 529                         if (midgard_has_hazard(segment, segment_size, ains))
 530                                 break;
 531
 532                         /* We're good to go -- emit the instruction */
 533                         ains->unit = unit;
 534
 535                         segment[segment_size++] = ains;
 536
 537                         /* We try to reuse constants if possible, by adjusting
 538                          * the swizzle */
 539
 540                         if (ains->has_blend_constant) {
 541                                 /* Everything conflicts with the blend constant */
 542                                 if (bundle.has_embedded_constants)
 543                                         break;
 544
 545                                 bundle.has_blend_constant = 1;
 546                                 bundle.has_embedded_constants = 1;
 547                         } else if (ains->has_constants && ains->alu.reg_mode == midgard_reg_mode_16) {
 548                                 /* TODO: DRY with the analysis pass */
 549
 550                                 if (bundle.has_blend_constant)
 551                                         break;
 552
 553                                 if (constant_count)
 554                                         break;
 555
 556                                 /* TODO: Fix packing XXX */
 557                                 uint16_t *bundles = (uint16_t *) bundle.constants;
 558                                 uint32_t *constants = (uint32_t *) ains->constants;
 559
 560                                 /* Copy them wholesale */
 561                                 for (unsigned i = 0; i < 4; ++i)
 562                                         bundles[i] = constants[i];
 563
 564                                 bundle.has_embedded_constants = true;
 565                                 constant_count = 4;
 566                         } else if (ains->has_constants) {
 567                                 /* By definition, blend constants conflict with
 568                                  * everything, so if there are already
 569                                  * constants we break the bundle *now* */
 570
 571                                 if (bundle.has_blend_constant)
 572                                         break;
 573
 574                                 /* For anything but blend constants, we can do
 575                                  * proper analysis, however */
 576
 577                                 /* TODO: Mask by which are used */
 578                                 uint32_t *constants = (uint32_t *) ains->constants;
 579                                 uint32_t *bundles = (uint32_t *) bundle.constants;
 580
 581                                 uint32_t indices[4] = { 0 };
 582                                 bool break_bundle = false;
 583
 584                                 for (unsigned i = 0; i < 4; ++i) {
 585                                         uint32_t cons = constants[i];
 586                                         bool constant_found = false;
 587
 588                                         /* Search for the constant */
 589                                         for (unsigned j = 0; j < constant_count; ++j) {
 590                                                 if (bundles[j] != cons)
 591                                                         continue;
 592
 593                                                 /* We found it, reuse */
 594                                                 indices[i] = j;
 595                                                 constant_found = true;
 596                                                 break;
 597                                         }
 598
 599                                         if (constant_found)
 600                                                 continue;
 601
 602                                         /* We didn't find it, so allocate it */
 603                                         unsigned idx = constant_count++;
 604
 605                                         if (idx >= 4) {
 606                                                 /* Uh-oh, out of space */
 607                                                 break_bundle = true;
 608                                                 break;
 609                                         }
 610
 611                                         /* We have space, copy it in! */
 612                                         bundles[idx] = cons;
 613                                         indices[i] = idx;
 614                                 }
 615
 616                                 if (break_bundle)
 617                                         break;
 618
 619                                 /* Cool, we have it in. So use indices as a
 620                                  * swizzle */
 621
 622                                 unsigned swizzle = SWIZZLE_FROM_ARRAY(indices);
 623                                 unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
 624
 625                                 if (ains->src[0] == r_constant)
 626                                         ains->alu.src1 = vector_alu_apply_swizzle(ains->alu.src1, swizzle);
 627
 628                                 if (ains->src[1] == r_constant)
 629                                         ains->alu.src2 = vector_alu_apply_swizzle(ains->alu.src2, swizzle);
 630
 631                                 bundle.has_embedded_constants = true;
 632                         }
 633
 634                         if (ains->compact_branch) {
 635                                 /* All of r0 has to be written out along with
 636                                  * the branch writeout */
 637
 638                                 if (ains->writeout && !can_writeout_fragment(ctx, scheduled, index, ctx->temp_count, ains->src[0])) {
 639                                         /* We only work on full moves
 640                                          * at the beginning. We could
 641                                          * probably do better */
 642                                         if (index != 0)
 643                                                 break;
 644
 645                                         /* Inject a move */
 646                                         midgard_instruction ins = v_mov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
 647                                         ins.unit = UNIT_VMUL;
 648                                         control |= ins.unit;
 649
 650                                         /* TODO don't leak */
 651                                         midgard_instruction *move =
 652                                                 mem_dup(&ins, sizeof(midgard_instruction));
 653                                         bytes_emitted += bytes_for_instruction(move);
 654                                         bundle.instructions[packed_idx++] = move;
 655                                 }
 656                         }
 657
 658                         bytes_emitted += bytes_for_instruction(ains);
 659
 660                         /* Defer marking until after writing to allow for break */
 661                         scheduled[index] = ains;
 662                         control |= ains->unit;
 663                         last_unit = ains->unit;
 664                         ++instructions_emitted;
 665                         ++index;
 666                 }
 667
 668                 int padding = 0;
 669
 670                 /* Pad ALU op to nearest word */
 671
 672                 if (bytes_emitted & 15) {
 673                         padding = 16 - (bytes_emitted & 15);
 674                         bytes_emitted += padding;
 675                 }
 676
 677                 /* Constants must always be quadwords */
 678                 if (bundle.has_embedded_constants)
 679                         bytes_emitted += 16;
 680
 681                 /* Size ALU instruction for tag */
 682                 bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
 683                 bundle.padding = padding;
 684                 bundle.control = bundle.tag | control;
 685
 686                 break;
 687         }
 688
 689         case TAG_LOAD_STORE_4: {
 690                 /* Load store instructions have two words at once. If
 691                  * we only have one queued up, we need to NOP pad.
 692                  * Otherwise, we store both in succession to save space
 693                  * and cycles -- letting them go in parallel -- skip
 694                  * the next. The usefulness of this optimisation is
 695                  * greatly dependent on the quality of the instruction
 696                  * scheduler.
 697                  */
 698
 699                 midgard_instruction *next_op = mir_next_op(ins);
 700
 701                 if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
 702                         /* TODO: Concurrency check */
 703                         instructions_emitted++;
 704                 }
 705
 706                 break;
 707         }
 708
 709         case TAG_TEXTURE_4: {
 710                 /* Which tag we use depends on the shader stage */
 711                 bool in_frag = ctx->stage == MESA_SHADER_FRAGMENT;
 712                 bundle.tag = in_frag ? TAG_TEXTURE_4 : TAG_TEXTURE_4_VTX;
 713                 break;
 714         }
 715
 716         default:
 717                 unreachable("Unknown tag");
 718                 break;
 719         }
 720
 721         /* Copy the instructions into the bundle */
 722         bundle.instruction_count = instructions_emitted + 1 + packed_idx;
 723
 724         midgard_instruction *uins = ins;
 725         for (; packed_idx < bundle.instruction_count; ++packed_idx) {
 726                 assert(&uins->link != &block->instructions);
 727                 bundle.instructions[packed_idx] = uins;
 728                 uins = mir_next_op(uins);
 729         }
 730
 731         *skip = instructions_emitted;
 732
 733         return bundle;
 734 }
 735
 736 /* We would like to flatten the linked list of midgard_instructions in a bundle
 737  * to an array of pointers on the heap for easy indexing */
 738
 739 static midgard_instruction **
 740 flatten_mir(midgard_block *block, unsigned *len)
 741 {
 742         *len = list_length(&block->instructions);
 743
 744         if (!(*len))
 745                 return NULL;
 746
 747         midgard_instruction **instructions =
 748                 calloc(sizeof(midgard_instruction *), *len);
 749
 750         unsigned i = 0;
 751
 752         mir_foreach_instr_in_block(block, ins)
 753                 instructions[i++] = ins;
 754
 755         return instructions;
 756 }
 757
 758 /* The worklist is the set of instructions that can be scheduled now; that is,
 759  * the set of instructions with no remaining dependencies */
 760
 761 static void
 762 mir_initialize_worklist(BITSET_WORD *worklist, midgard_instruction **instructions, unsigned count)
 763 {
 764         for (unsigned i = 0; i < count; ++i) {
 765                 if (instructions[i]->nr_dependencies == 0)
 766                         BITSET_SET(worklist, i);
 767         }
 768 }
 769
 770 /* Update the worklist after an instruction terminates. Remove its edges from
 771  * the graph and if that causes any node to have no dependencies, add it to the
 772  * worklist */
 773
 774 static void
 775 mir_update_worklist(
 776                 BITSET_WORD *worklist, unsigned count,
 777                 midgard_instruction **instructions, midgard_instruction *done)
 778 {
 779         /* Sanity check: if no instruction terminated, there is nothing to do.
 780          * If the instruction that terminated had dependencies, that makes no
 781          * sense and means we messed up the worklist. Finally, as the purpose
 782          * of this routine is to update dependents, we abort early if there are
 783          * no dependents defined. */
 784
 785         if (!done)
 786                 return;
 787
 788         assert(done->nr_dependencies == 0);
 789
 790         if (!done->dependents)
 791                 return;
 792
 793         /* We have an instruction with dependents. Iterate each dependent to
 794          * remove one dependency (`done`), adding dependents to the worklist
 795          * where possible. */
 796
 797         unsigned i;
 798         BITSET_WORD tmp;
 799         BITSET_FOREACH_SET(i, tmp, done->dependents, count) {
 800                 assert(instructions[i]->nr_dependencies);
 801
 802                 if (!(--instructions[i]->nr_dependencies))
 803                         BITSET_SET(worklist, i);
 804         }
 805
 806         free(done->dependents);
 807 }
 808
 809 /* While scheduling, we need to choose instructions satisfying certain
 810  * criteria. As we schedule backwards, we choose the *last* instruction in the
 811  * worklist to simulate in-order scheduling. Chosen instructions must satisfy a
 812  * given predicate. */
 813
 814 struct midgard_predicate {
 815         /* TAG or ~0 for dont-care */
 816         unsigned tag;
 817
 818         /* True if we want to pop off the chosen instruction */
 819         bool destructive;
 820
 821         /* For ALU, choose only this unit */
 822         unsigned unit;
 823
 824         /* State for bundle constants. constants is the actual constants
 825          * for the bundle. constant_count is the number of bytes (up to
 826          * 16) currently in use for constants. When picking in destructive
 827          * mode, the constants array will be updated, and the instruction
 828          * will be adjusted to index into the constants array */
 829
 830         uint8_t *constants;
 831         unsigned constant_count;
 832         bool blend_constant;
 833
 834         /* Exclude this destination (if not ~0) */
 835         unsigned exclude;
 836 };
 837
 838 /* For an instruction that can fit, adjust it to fit and update the constants
 839  * array, in destructive mode. Returns whether the fitting was successful. */
 840
 841 static bool
 842 mir_adjust_constants(midgard_instruction *ins,
 843                 struct midgard_predicate *pred,
 844                 bool destructive)
 845 {
 846         /* Blend constants dominate */
 847         if (ins->has_blend_constant) {
 848                 if (pred->constant_count)
 849                         return false;
 850                 else if (destructive) {
 851                         pred->blend_constant = true;
 852                         pred->constant_count = 16;
 853                         return true;
 854                 }
 855         }
 856
 857         /* No constant, nothing to adjust */
 858         if (!ins->has_constants)
 859                 return true;
 860
 861         /* TODO: Deduplicate; permit multiple constants within a bundle */
 862
 863         if (destructive && !pred->constant_count) {
 864                 if (ins->alu.reg_mode == midgard_reg_mode_16) {
 865                       /* TODO: Fix packing XXX */
 866                         uint16_t *bundles = (uint16_t *) pred->constants;
 867                         uint32_t *constants = (uint32_t *) ins->constants;
 868
 869                         /* Copy them wholesale */
 870                         for (unsigned i = 0; i < 4; ++i)
 871                                 bundles[i] = constants[i];
 872                 } else {
 873                         memcpy(pred->constants, ins->constants, 16);
 874                 }
 875
 876                 pred->constant_count = 16;
 877                 return true;
 878         }
 879
 880         return !pred->constant_count;
 881 }
 882
 883 static midgard_instruction *
 884 mir_choose_instruction(
 885                 midgard_instruction **instructions,
 886                 BITSET_WORD *worklist, unsigned count,
 887                 struct midgard_predicate *predicate)
 888 {
 889         /* Parse the predicate */
 890         unsigned tag = predicate->tag;
 891         bool alu = tag == TAG_ALU_4;
 892         unsigned unit = predicate->unit;
 893         bool branch = alu && (unit == ALU_ENAB_BR_COMPACT);
 894
 895         /* Iterate to find the best instruction satisfying the predicate */
 896         unsigned i;
 897         BITSET_WORD tmp;
 898
 899         signed best_index = -1;
 900
 901         BITSET_FOREACH_SET(i, tmp, worklist, count) {
 902                 if (tag != ~0 && instructions[i]->type != tag)
 903                         continue;
 904
 905                 if (predicate->exclude != ~0 && instructions[i]->dest == predicate->exclude)
 906                         continue;
 907
 908                 if (alu && !branch && !(alu_opcode_props[instructions[i]->alu.op].props & unit))
 909                         continue;
 910
 911                 if (branch && !instructions[i]->compact_branch)
 912                         continue;
 913
 914                 /* Simulate in-order scheduling */
 915                 if ((signed) i < best_index)
 916                         continue;
 917
 918                 best_index = i;
 919         }
 920
 921
 922         /* Did we find anything?  */
 923
 924         if (best_index < 0)
 925                 return NULL;
 926
 927         /* If we found something, remove it from the worklist */
 928         assert(best_index < count);
 929
 930         if (predicate->destructive) {
 931                 BITSET_CLEAR(worklist, best_index);
 932         }
 933
 934         return instructions[best_index];
 935 }
 936
 937 /* Still, we don't choose instructions in a vacuum. We need a way to choose the
 938  * best bundle type (ALU, load/store, texture). Nondestructive. */
 939
 940 static unsigned
 941 mir_choose_bundle(
 942                 midgard_instruction **instructions,
 943                 BITSET_WORD *worklist, unsigned count)
 944 {
 945         /* At the moment, our algorithm is very simple - use the bundle of the
 946          * best instruction, regardless of what else could be scheduled
 947          * alongside it. This is not optimal but it works okay for in-order */
 948
 949         struct midgard_predicate predicate = {
 950                 .tag = ~0,
 951                 .destructive = false,
 952                 .exclude = ~0
 953         };
 954
 955         midgard_instruction *chosen = mir_choose_instruction(instructions, worklist, count, &predicate);
 956
 957         if (chosen)
 958                 return chosen->type;
 959         else
 960                 return ~0;
 961 }
 962
 963 /* We want to choose an ALU instruction filling a given unit */
 964 static void
 965 mir_choose_alu(midgard_instruction **slot,
 966                 midgard_instruction **instructions,
 967                 BITSET_WORD *worklist, unsigned len,
 968                 struct midgard_predicate *predicate,
 969                 unsigned unit)
 970 {
 971         /* Did we already schedule to this slot? */
 972         if ((*slot) != NULL)
 973                 return;
 974
 975         /* Try to schedule something, if not */
 976         predicate->unit = unit;
 977         *slot = mir_choose_instruction(instructions, worklist, len, predicate);
 978
 979         /* Store unit upon scheduling */
 980         if (*slot && !((*slot)->compact_branch))
 981                 (*slot)->unit = unit;
 982 }
 983
 984 /* When we are scheduling a branch/csel, we need the consumed condition in the
 985  * same block as a pipeline register. There are two options to enable this:
 986  *
 987  *  - Move the conditional into the bundle. Preferred, but only works if the
 988  *    conditional is used only once and is from this block.
 989  *  - Copy the conditional.
 990  *
 991  * We search for the conditional. If it's in this block, single-use, and
 992  * without embedded constants, we schedule it immediately. Otherwise, we
 993  * schedule a move for it.
 994  *
 995  * mir_comparison_mobile is a helper to find the moveable condition.
 996  */
 997
 998 static unsigned
 999 mir_comparison_mobile(
1000                 compiler_context *ctx,
1001                 midgard_instruction **instructions,
1002                 unsigned count,
1003                 unsigned cond)
1004 {
1005         if (!mir_single_use(ctx, cond))
1006                 return ~0;
1007
1008         unsigned ret = ~0;
1009
1010         for (unsigned i = 0; i < count; ++i) {
1011                 if (instructions[i]->dest != cond)
1012                         continue;
1013
1014                 /* Must fit in an ALU bundle */
1015                 if (instructions[i]->type != TAG_ALU_4)
1016                         return ~0;
1017
1018                 /* We'll need to rewrite to .w but that doesn't work for vector
1019                  * ops that don't replicate (ball/bany), so bail there */
1020
1021                 if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->alu.op].props))
1022                         return ~0;
1023
1024                 /* TODO: moving conditionals with constants */
1025
1026                 if (instructions[i]->has_constants)
1027                         return ~0;
1028
1029                 /* Ensure it is written only once */
1030
1031                 if (ret != ~0)
1032                         return ~0;
1033                 else
1034                         ret = i;
1035         }
1036
1037         return ret;
1038 }
1039
1040 /* Using the information about the moveable conditional itself, we either pop
1041  * that condition off the worklist for use now, or create a move to
1042  * artificially schedule instead as a fallback */
1043
1044 static midgard_instruction *
1045 mir_schedule_comparison(
1046                 compiler_context *ctx,
1047                 midgard_instruction **instructions,
1048                 BITSET_WORD *worklist, unsigned count,
1049                 unsigned cond, bool vector, unsigned swizzle,
1050                 midgard_instruction *user)
1051 {
1052         /* TODO: swizzle when scheduling */
1053         unsigned comp_i =
1054                 (!vector && (swizzle == 0)) ?
1055                 mir_comparison_mobile(ctx, instructions, count, cond) : ~0;
1056
1057         /* If we can, schedule the condition immediately */
1058         if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) {
1059                 assert(comp_i < count);
1060                 BITSET_CLEAR(worklist, comp_i);
1061                 return instructions[comp_i];
1062         }
1063
1064         /* Otherwise, we insert a move */
1065         midgard_vector_alu_src csel = {
1066                 .swizzle = swizzle
1067         };
1068
1069         midgard_instruction mov = v_mov(cond, csel, cond);
1070         mov.mask = vector ? 0xF : 0x1;
1071
1072         return mir_insert_instruction_before(ctx, user, mov);
1073 }
1074
1075 /* Most generally, we need instructions writing to r31 in the appropriate
1076  * components */
1077
1078 static midgard_instruction *
1079 mir_schedule_condition(compiler_context *ctx,
1080                 struct midgard_predicate *predicate,
1081                 BITSET_WORD *worklist, unsigned count,
1082                 midgard_instruction **instructions,
1083                 midgard_instruction *last)
1084 {
1085         /* For a branch, the condition is the only argument; for csel, third */
1086         bool branch = last->compact_branch;
1087         unsigned condition_index = branch ? 0 : 2;
1088
1089         /* csel_v is vector; otherwise, conditions are scalar */
1090         bool vector = !branch && OP_IS_CSEL_V(last->alu.op);
1091
1092         /* Grab the conditional instruction */
1093
1094         midgard_instruction *cond = mir_schedule_comparison(
1095                         ctx, instructions, worklist, count, last->src[condition_index],
1096                         vector, last->cond_swizzle, last);
1097
1098         /* We have exclusive reign over this (possibly move) conditional
1099          * instruction. We can rewrite into a pipeline conditional register */
1100
1101         predicate->exclude = cond->dest;
1102         cond->dest = SSA_FIXED_REGISTER(31);
1103
1104         if (!vector) {
1105                 cond->mask = (1 << COMPONENT_W);
1106
1107                 mir_foreach_src(cond, s) {
1108                         if (cond->src[s] == ~0)
1109                                 continue;
1110
1111                         mir_set_swizzle(cond, s, (mir_get_swizzle(cond, s) << (2*3)) & 0xFF);
1112                 }
1113         }
1114
1115         /* Schedule the unit: csel is always in the latter pipeline, so a csel
1116          * condition must be in the former pipeline stage (vmul/sadd),
1117          * depending on scalar/vector of the instruction itself. A branch must
1118          * be written from the latter pipeline stage and a branch condition is
1119          * always scalar, so it is always in smul (exception: ball/bany, which
1120          * will be vadd) */
1121
1122         if (branch)
1123                 cond->unit = UNIT_SMUL;
1124         else
1125                 cond->unit = vector ? UNIT_VMUL : UNIT_SADD;
1126
1127         return cond;
1128 }
1129
1130 /* Schedules a single bundle of the given type */
1131
1132 static midgard_bundle
1133 mir_schedule_texture(
1134                 midgard_instruction **instructions,
1135                 BITSET_WORD *worklist, unsigned len)
1136 {
1137         struct midgard_predicate predicate = {
1138                 .tag = TAG_TEXTURE_4,
1139                 .destructive = true,
1140                 .exclude = ~0
1141         };
1142
1143         midgard_instruction *ins =
1144                 mir_choose_instruction(instructions, worklist, len, &predicate);
1145
1146         mir_update_worklist(worklist, len, instructions, ins);
1147
1148         struct midgard_bundle out = {
1149                 .tag = TAG_TEXTURE_4,
1150                 .instruction_count = 1,
1151                 .instructions = { ins }
1152         };
1153
1154         return out;
1155 }
1156
1157 static midgard_bundle
1158 mir_schedule_ldst(
1159                 midgard_instruction **instructions,
1160                 BITSET_WORD *worklist, unsigned len)
1161 {
1162         struct midgard_predicate predicate = {
1163                 .tag = TAG_LOAD_STORE_4,
1164                 .destructive = true,
1165                 .exclude = ~0
1166         };
1167
1168         /* Try to pick two load/store ops. Second not gauranteed to exist */
1169
1170         midgard_instruction *ins =
1171                 mir_choose_instruction(instructions, worklist, len, &predicate);
1172
1173         midgard_instruction *pair =
1174                 mir_choose_instruction(instructions, worklist, len, &predicate);
1175
1176         struct midgard_bundle out = {
1177                 .tag = TAG_LOAD_STORE_4,
1178                 .instruction_count = pair ? 2 : 1,
1179                 .instructions = { ins, pair }
1180         };
1181
1182         /* We have to update the worklist atomically, since the two
1183          * instructions run concurrently (TODO: verify it's not pipelined) */
1184
1185         mir_update_worklist(worklist, len, instructions, ins);
1186         mir_update_worklist(worklist, len, instructions, pair);
1187
1188         return out;
1189 }
1190
1191 static midgard_bundle
1192 mir_schedule_alu(
1193                 compiler_context *ctx,
1194                 midgard_instruction **instructions,
1195                 BITSET_WORD *worklist, unsigned len)
1196 {
1197         struct midgard_bundle bundle = {};
1198
1199         unsigned bytes_emitted = sizeof(bundle.control);
1200
1201         struct midgard_predicate predicate = {
1202                 .tag = TAG_ALU_4,
1203                 .destructive = true,
1204                 .exclude = ~0
1205         };
1206
1207         midgard_instruction *ins =
1208                 mir_choose_instruction(instructions, worklist, len, &predicate);
1209
1210         midgard_instruction *vmul = NULL;
1211         midgard_instruction *vadd = NULL;
1212         midgard_instruction *vlut = NULL;
1213         midgard_instruction *smul = NULL;
1214         midgard_instruction *sadd = NULL;
1215         midgard_instruction *branch = NULL;
1216
1217         mir_update_worklist(worklist, len, instructions, ins);
1218
1219         if (ins->compact_branch) {
1220                 branch = ins;
1221         } else if (!ins->unit) {
1222                 unsigned units = alu_opcode_props[ins->alu.op].props;
1223
1224                 if (units & UNIT_VMUL) {
1225                         ins->unit = UNIT_VMUL;
1226                         vmul = ins;
1227                 } else if (units & UNIT_VADD) {
1228                         ins->unit = UNIT_VADD;
1229                         vadd = ins;
1230                 } else if (units & UNIT_VLUT) {
1231                         ins->unit = UNIT_VLUT;
1232                         vlut = ins;
1233                 } else
1234                         assert(0);
1235         }
1236
1237         bundle.has_embedded_constants = ins->has_constants;
1238         bundle.has_blend_constant = ins->has_blend_constant;
1239
1240         if (ins->alu.reg_mode == midgard_reg_mode_16) {
1241               /* TODO: Fix packing XXX */
1242                 uint16_t *bundles = (uint16_t *) bundle.constants;
1243                 uint32_t *constants = (uint32_t *) ins->constants;
1244
1245                 /* Copy them wholesale */
1246                 for (unsigned i = 0; i < 4; ++i)
1247                         bundles[i] = constants[i];
1248         } else {
1249                 memcpy(bundle.constants, ins->constants, sizeof(bundle.constants));
1250         }
1251
1252         if (ins->writeout) {
1253                 unsigned src = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0];
1254                 unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : make_compiler_temp(ctx);
1255                 midgard_instruction mov = v_mov(src, blank_alu_src, temp);
1256                 vmul = mem_dup(&mov, sizeof(midgard_instruction));
1257                 vmul->unit = UNIT_VMUL;
1258                 vmul->mask = 0xF;
1259                 /* TODO: Don't leak */
1260
1261                 /* Rewrite to use our temp */
1262                 midgard_instruction *stages[] = { sadd, vadd, smul };
1263
1264                 for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
1265                         if (stages[i])
1266                                 mir_rewrite_index_dst_single(stages[i], src, temp);
1267                 }
1268
1269                 mir_rewrite_index_src_single(branch, src, temp);
1270         }
1271
1272         if ((vadd && OP_IS_CSEL(vadd->alu.op)) || (smul && OP_IS_CSEL(smul->alu.op)) || (ins->compact_branch && !ins->prepacked_branch && ins->branch.conditional)) {
1273                 midgard_instruction *cond = mir_choose_instruction(instructions, worklist, len, &predicate);
1274                 mir_update_worklist(worklist, len, instructions, cond);
1275
1276                 if (!cond->unit) {
1277                         unsigned units = alu_opcode_props[cond->alu.op].props;
1278
1279                         if (units & UNIT_VMUL) {
1280                                 cond->unit = UNIT_VMUL;
1281                         } else if (units & UNIT_VADD) {
1282                                 cond->unit = UNIT_VADD;
1283                         } else
1284                                 assert(0);
1285                 }
1286
1287                 if (cond->unit & UNIT_VMUL)
1288                         vmul = cond;
1289                 else if (cond->unit & UNIT_SADD)
1290                         sadd = cond;
1291                 else if (cond->unit & UNIT_VADD)
1292                         vadd = cond;
1293                 else if (cond->unit & UNIT_SMUL)
1294                         smul = cond;
1295                 else
1296                         unreachable("Bad condition");
1297         }
1298
1299         unsigned padding = 0;
1300
1301         /* Now that we have finished scheduling, build up the bundle */
1302         midgard_instruction *stages[] = { vmul, sadd, vadd, smul, vlut, branch };
1303
1304         for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
1305                 if (stages[i]) {
1306                         bundle.control |= stages[i]->unit;
1307                         bytes_emitted += bytes_for_instruction(stages[i]);
1308                         bundle.instructions[bundle.instruction_count++] = stages[i];
1309                 }
1310         }
1311
1312         /* Pad ALU op to nearest word */
1313
1314         if (bytes_emitted & 15) {
1315                 padding = 16 - (bytes_emitted & 15);
1316                 bytes_emitted += padding;
1317         }
1318
1319         /* Constants must always be quadwords */
1320         if (bundle.has_embedded_constants)
1321                 bytes_emitted += 16;
1322
1323         /* Size ALU instruction for tag */
1324         bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
1325         bundle.padding = padding;
1326         bundle.control |= bundle.tag;
1327
1328         return bundle;
1329 }
1330
1331 /* Schedule a single block by iterating its instruction to create bundles.
1332  * While we go, tally about the bundle sizes to compute the block size. */
1333
1334
1335 static void
1336 schedule_block(compiler_context *ctx, midgard_block *block)
1337 {
1338         /* Copy list to dynamic array */
1339         unsigned len = 0;
1340         midgard_instruction **instructions = flatten_mir(block, &len);
1341
1342         /* Calculate dependencies and initial worklist */
1343         unsigned node_count = ctx->temp_count + 1;
1344         mir_create_dependency_graph(instructions, len, node_count);
1345
1346         /* Allocate the worklist */
1347         size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD);
1348         BITSET_WORD *worklist = calloc(sz, 1);
1349         mir_initialize_worklist(worklist, instructions, len);
1350
1351         util_dynarray_init(&block->bundles, NULL);
1352
1353         block->quadword_count = 0;
1354
1355         int skip = 0;
1356         mir_foreach_instr_in_block(block, ins) {
1357                 if (skip) {
1358                         skip--;
1359                         continue;
1360                 }
1361
1362                 midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
1363                 util_dynarray_append(&block->bundles, midgard_bundle, bundle);
1364
1365                 if (bundle.has_blend_constant) {
1366                         unsigned offset = ctx->quadword_count + block->quadword_count + quadword_size(bundle.tag) - 1;
1367                         ctx->blend_constant_offset = offset * 0x10;
1368                 }
1369
1370                 block->quadword_count += quadword_size(bundle.tag);
1371         }
1372
1373         block->is_scheduled = true;
1374         ctx->quadword_count += block->quadword_count;
1375 }
1376
1377 /* When we're 'squeezing down' the values in the IR, we maintain a hash
1378  * as such */
1379
1380 static unsigned
1381 find_or_allocate_temp(compiler_context *ctx, unsigned hash)
1382 {
1383         if (hash >= SSA_FIXED_MINIMUM)
1384                 return hash;
1385
1386         unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
1387                                 ctx->hash_to_temp, hash + 1);
1388
1389         if (temp)
1390                 return temp - 1;
1391
1392         /* If no temp is find, allocate one */
1393         temp = ctx->temp_count++;
1394         ctx->max_hash = MAX2(ctx->max_hash, hash);
1395
1396         _mesa_hash_table_u64_insert(ctx->hash_to_temp,
1397                                     hash + 1, (void *) ((uintptr_t) temp + 1));
1398
1399         return temp;
1400 }
1401
1402 /* Reassigns numbering to get rid of gaps in the indices */
1403
1404 static void
1405 mir_squeeze_index(compiler_context *ctx)
1406 {
1407         /* Reset */
1408         ctx->temp_count = 0;
1409         /* TODO don't leak old hash_to_temp */
1410         ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
1411
1412         mir_foreach_instr_global(ctx, ins) {
1413                 ins->dest = find_or_allocate_temp(ctx, ins->dest);
1414
1415                 for (unsigned i = 0; i < ARRAY_SIZE(ins->src); ++i)
1416                         ins->src[i] = find_or_allocate_temp(ctx, ins->src[i]);
1417         }
1418 }
1419
1420 static midgard_instruction
1421 v_load_store_scratch(
1422                 unsigned srcdest,
1423                 unsigned index,
1424                 bool is_store,
1425                 unsigned mask)
1426 {
1427         /* We index by 32-bit vec4s */
1428         unsigned byte = (index * 4 * 4);
1429
1430         midgard_instruction ins = {
1431                 .type = TAG_LOAD_STORE_4,
1432                 .mask = mask,
1433                 .dest = ~0,
1434                 .src = { ~0, ~0, ~0 },
1435                 .load_store = {
1436                         .op = is_store ? midgard_op_st_int4 : midgard_op_ld_int4,
1437                         .swizzle = SWIZZLE_XYZW,
1438
1439                         /* For register spilling - to thread local storage */
1440                         .arg_1 = 0xEA,
1441                         .arg_2 = 0x1E,
1442
1443                         /* Splattered across, TODO combine logically */
1444                         .varying_parameters = (byte & 0x1FF) << 1,
1445                         .address = (byte >> 9)
1446                 },
1447
1448                 /* If we spill an unspill, RA goes into an infinite loop */
1449                 .no_spill = true
1450         };
1451
1452        if (is_store) {
1453                 /* r0 = r26, r1 = r27 */
1454                 assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
1455                 ins.src[0] = srcdest;
1456         } else {
1457                 ins.dest = srcdest;
1458         }
1459
1460         return ins;
1461 }
1462
1463 /* If register allocation fails, find the best spill node and spill it to fix
1464  * whatever the issue was. This spill node could be a work register (spilling
1465  * to thread local storage), but it could also simply be a special register
1466  * that needs to spill to become a work register. */
1467
1468 static void mir_spill_register(
1469                 compiler_context *ctx,
1470                 struct ra_graph *g,
1471                 unsigned *spill_count)
1472 {
1473         unsigned spill_index = ctx->temp_count;
1474
1475         /* Our first step is to calculate spill cost to figure out the best
1476          * spill node. All nodes are equal in spill cost, but we can't spill
1477          * nodes written to from an unspill */
1478
1479         for (unsigned i = 0; i < ctx->temp_count; ++i) {
1480                 ra_set_node_spill_cost(g, i, 1.0);
1481         }
1482
1483         /* We can't spill any bundles that contain unspills. This could be
1484          * optimized to allow use of r27 to spill twice per bundle, but if
1485          * you're at the point of optimizing spilling, it's too late. */
1486
1487         mir_foreach_block(ctx, block) {
1488                 mir_foreach_bundle_in_block(block, bun) {
1489                         bool no_spill = false;
1490
1491                         for (unsigned i = 0; i < bun->instruction_count; ++i)
1492                                 no_spill |= bun->instructions[i]->no_spill;
1493
1494                         if (!no_spill)
1495                                 continue;
1496
1497                         for (unsigned i = 0; i < bun->instruction_count; ++i) {
1498                                 unsigned dest = bun->instructions[i]->dest;
1499                                 if (dest < ctx->temp_count)
1500                                         ra_set_node_spill_cost(g, dest, -1.0);
1501                         }
1502                 }
1503         }
1504
1505         int spill_node = ra_get_best_spill_node(g);
1506
1507         if (spill_node < 0) {
1508                 mir_print_shader(ctx);
1509                 assert(0);
1510         }
1511
1512         /* We have a spill node, so check the class. Work registers
1513          * legitimately spill to TLS, but special registers just spill to work
1514          * registers */
1515
1516         unsigned class = ra_get_node_class(g, spill_node);
1517         bool is_special = (class >> 2) != REG_CLASS_WORK;
1518         bool is_special_w = (class >> 2) == REG_CLASS_TEXW;
1519
1520         /* Allocate TLS slot (maybe) */
1521         unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
1522
1523         /* For TLS, replace all stores to the spilled node. For
1524          * special reads, just keep as-is; the class will be demoted
1525          * implicitly. For special writes, spill to a work register */
1526
1527         if (!is_special || is_special_w) {
1528                 if (is_special_w)
1529                         spill_slot = spill_index++;
1530
1531                 mir_foreach_block(ctx, block) {
1532                 mir_foreach_instr_in_block_safe(block, ins) {
1533                         if (ins->dest != spill_node) continue;
1534
1535                         midgard_instruction st;
1536
1537                         if (is_special_w) {
1538                                 st = v_mov(spill_node, blank_alu_src, spill_slot);
1539                                 st.no_spill = true;
1540                         } else {
1541                                 ins->dest = SSA_FIXED_REGISTER(26);
1542                                 ins->no_spill = true;
1543                                 st = v_load_store_scratch(ins->dest, spill_slot, true, ins->mask);
1544                         }
1545
1546                         /* Hint: don't rewrite this node */
1547                         st.hint = true;
1548
1549                         mir_insert_instruction_after_scheduled(ctx, block, ins, st);
1550
1551                         if (!is_special)
1552                                 ctx->spills++;
1553                 }
1554                 }
1555         }
1556
1557         /* For special reads, figure out how many components we need */
1558         unsigned read_mask = 0;
1559
1560         mir_foreach_instr_global_safe(ctx, ins) {
1561                 read_mask |= mir_mask_of_read_components(ins, spill_node);
1562         }
1563
1564         /* Insert a load from TLS before the first consecutive
1565          * use of the node, rewriting to use spilled indices to
1566          * break up the live range. Or, for special, insert a
1567          * move. Ironically the latter *increases* register
1568          * pressure, but the two uses of the spilling mechanism
1569          * are somewhat orthogonal. (special spilling is to use
1570          * work registers to back special registers; TLS
1571          * spilling is to use memory to back work registers) */
1572
1573         mir_foreach_block(ctx, block) {
1574                 bool consecutive_skip = false;
1575                 unsigned consecutive_index = 0;
1576
1577                 mir_foreach_instr_in_block(block, ins) {
1578                         /* We can't rewrite the moves used to spill in the
1579                          * first place. These moves are hinted. */
1580                         if (ins->hint) continue;
1581
1582                         if (!mir_has_arg(ins, spill_node)) {
1583                                 consecutive_skip = false;
1584                                 continue;
1585                         }
1586
1587                         if (consecutive_skip) {
1588                                 /* Rewrite */
1589                                 mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
1590                                 continue;
1591                         }
1592
1593                         if (!is_special_w) {
1594                                 consecutive_index = ++spill_index;
1595
1596                                 midgard_instruction *before = ins;
1597
1598                                 /* For a csel, go back one more not to break up the bundle */
1599                                 if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
1600                                         before = mir_prev_op(before);
1601
1602                                 midgard_instruction st;
1603
1604                                 if (is_special) {
1605                                         /* Move */
1606                                         st = v_mov(spill_node, blank_alu_src, consecutive_index);
1607                                         st.no_spill = true;
1608                                 } else {
1609                                         /* TLS load */
1610                                         st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
1611                                 }
1612
1613                                 /* Mask the load based on the component count
1614                                  * actually needed to prvent RA loops */
1615
1616                                 st.mask = read_mask;
1617
1618                                 mir_insert_instruction_before_scheduled(ctx, block, before, st);
1619                                // consecutive_skip = true;
1620                         } else {
1621                                 /* Special writes already have their move spilled in */
1622                                 consecutive_index = spill_slot;
1623                         }
1624
1625
1626                         /* Rewrite to use */
1627                         mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
1628
1629                         if (!is_special)
1630                                 ctx->fills++;
1631                 }
1632         }
1633
1634         /* Reset hints */
1635
1636         mir_foreach_instr_global(ctx, ins) {
1637                 ins->hint = false;
1638         }
1639 }
1640
1641 void
1642 schedule_program(compiler_context *ctx)
1643 {
1644         struct ra_graph *g = NULL;
1645         bool spilled = false;
1646         int iter_count = 1000; /* max iterations */
1647
1648         /* Number of 128-bit slots in memory we've spilled into */
1649         unsigned spill_count = 0;
1650
1651         midgard_promote_uniforms(ctx, 16);
1652
1653         /* Must be lowered right before RA */
1654         mir_squeeze_index(ctx);
1655         mir_lower_special_reads(ctx);
1656         mir_squeeze_index(ctx);
1657
1658         /* Lowering can introduce some dead moves */
1659
1660         mir_foreach_block(ctx, block) {
1661                 midgard_opt_dead_move_eliminate(ctx, block);
1662                 schedule_block(ctx, block);
1663         }
1664
1665         mir_create_pipeline_registers(ctx);
1666
1667         do {
1668                 if (spilled)
1669                         mir_spill_register(ctx, g, &spill_count);
1670
1671                 mir_squeeze_index(ctx);
1672
1673                 g = NULL;
1674                 g = allocate_registers(ctx, &spilled);
1675         } while(spilled && ((iter_count--) > 0));
1676
1677         if (iter_count <= 0) {
1678                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
1679                 assert(0);
1680         }
1681
1682         /* Report spilling information. spill_count is in 128-bit slots (vec4 x
1683          * fp32), but tls_size is in bytes, so multiply by 16 */
1684
1685         ctx->tls_size = spill_count * 16;
1686
1687         install_registers(ctx, g);
1688 }