src/gallium/drivers/freedreno/ir3/ir3_ra.c

   1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
   2
   3 /*
   4  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice (including the next
  14  * paragraph) shall be included in all copies or substantial portions of the
  15  * Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23  * SOFTWARE.
  24  *
  25  * Authors:
  26  *    Rob Clark <robclark@freedesktop.org>
  27  */
  28
  29 #include "util/u_math.h"
  30 #include "util/register_allocate.h"
  31 #include "util/ralloc.h"
  32 #include "util/bitset.h"
  33
  34 #include "ir3.h"
  35 #include "ir3_compiler.h"
  36
  37 /*
  38  * Register Assignment:
  39  *
  40  * Uses the register_allocate util, which implements graph coloring
  41  * algo with interference classes.  To handle the cases where we need
  42  * consecutive registers (for example, texture sample instructions),
  43  * we model these as larger (double/quad/etc) registers which conflict
  44  * with the corresponding registers in other classes.
  45  *
  46  * Additionally we create additional classes for half-regs, which
  47  * do not conflict with the full-reg classes.  We do need at least
  48  * sizes 1-4 (to deal w/ texture sample instructions output to half-
  49  * reg).  At the moment we don't create the higher order half-reg
  50  * classes as half-reg frequently does not have enough precision
  51  * for texture coords at higher resolutions.
  52  *
  53  * There are some additional cases that we need to handle specially,
  54  * as the graph coloring algo doesn't understand "partial writes".
  55  * For example, a sequence like:
  56  *
  57  *   add r0.z, ...
  58  *   sam (f32)(xy)r0.x, ...
  59  *   ...
  60  *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
  61  *
  62  * In this scenario, we treat r0.xyz as class size 3, which is written
  63  * (from a use/def perspective) at the 'add' instruction and ignore the
  64  * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
  65  * defining instruction, as it is the first to partially write r0.xyz.
  66  *
 * Note i965 has a similar scenario, which they solve with a virtual
 * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
 * register assignment.  But for us that is horrible from a scheduling
 * standpoint.  Instead we use the idea of a 'definer' instruction,
 * ie. the first instruction (lowest ip) to write to the array is the
 * one we consider from a use/def perspective when building the
 * interference graph.  (Other instructions which write other array
 * elements just define the variable some more.)
 */
  75  */
  76
  77 static const unsigned class_sizes[] = {
  78         1, 2, 3, 4,
  79         4 + 4, /* txd + 1d/2d */
  80         4 + 6, /* txd + 3d */
  81         /* temporary: until we can assign arrays, create classes so we
  82          * can round up array to fit.  NOTE with tgsi arrays should
  83          * really all be multiples of four:
  84          */
  85         4 * 4,
  86         4 * 8,
  87         4 * 16,
  88         4 * 32,
  89
  90 };
  91 #define class_count ARRAY_SIZE(class_sizes)
  92
  93 static const unsigned half_class_sizes[] = {
  94         1, 2, 3, 4,
  95 };
  96 #define half_class_count  ARRAY_SIZE(half_class_sizes)
  97 #define total_class_count (class_count + half_class_count)
  98
  99 /* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
 100 #define NUM_REGS             (4 * (REG_A0 - 1))
 101 /* Number of virtual regs in a given class: */
 102 #define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
 103 #define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
 104
 105 /* register-set, created one time, used for all shaders: */
 106 struct ir3_ra_reg_set {
 107         struct ra_regs *regs;
 108         unsigned int classes[class_count];
 109         unsigned int half_classes[half_class_count];
 110         /* maps flat virtual register space to base gpr: */
 111         uint16_t *ra_reg_to_gpr;
 112         /* maps cls,gpr to flat virtual register space: */
 113         uint16_t **gpr_to_ra_reg;
 114 };
 115
 116 /* One-time setup of RA register-set, which describes all the possible
 117  * "virtual" registers and their interferences.  Ie. double register
 118  * occupies (and conflicts with) two single registers, and so forth.
 119  * Since registers do not need to be aligned to their class size, they
 120  * can conflict with other registers in the same class too.  Ie:
 121  *
 122  *    Single (base) |  Double
 123  *    --------------+---------------
 124  *       R0         |  D0
 125  *       R1         |  D0 D1
 126  *       R2         |     D1 D2
 127  *       R3         |        D2
 128  *           .. and so on..
 129  *
 130  * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
 131  * really just four scalar registers.  Don't let that confuse you.)
 132  */
 133 struct ir3_ra_reg_set *
 134 ir3_ra_alloc_reg_set(void *memctx)
 135 {
 136         struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
 137         unsigned ra_reg_count, reg, first_half_reg;
 138         unsigned int **q_values;
 139
 140         /* calculate # of regs across all classes: */
 141         ra_reg_count = 0;
 142         for (unsigned i = 0; i < class_count; i++)
 143                 ra_reg_count += CLASS_REGS(i);
 144         for (unsigned i = 0; i < half_class_count; i++)
 145                 ra_reg_count += HALF_CLASS_REGS(i);
 146
 147         /* allocate and populate q_values: */
 148         q_values = ralloc_array(set, unsigned *, total_class_count);
 149         for (unsigned i = 0; i < class_count; i++) {
 150                 q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
 151
 152                 /* From register_allocate.c:
 153                  *
 154                  * q(B,C) (indexed by C, B is this register class) in
 155                  * Runeson/Nyström paper.  This is "how many registers of B could
 156                  * the worst choice register from C conflict with".
 157                  *
 158                  * If we just let the register allocation algorithm compute these
 159                  * values, is extremely expensive.  However, since all of our
 160                  * registers are laid out, we can very easily compute them
 161                  * ourselves.  View the register from C as fixed starting at GRF n
 162                  * somewhere in the middle, and the register from B as sliding back
 163                  * and forth.  Then the first register to conflict from B is the
 164                  * one starting at n - class_size[B] + 1 and the last register to
 165                  * conflict will start at n + class_size[B] - 1.  Therefore, the
 166                  * number of conflicts from B is class_size[B] + class_size[C] - 1.
 167                  *
 168                  *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
 169                  * B | | | | | |n| --> | | | | | | |
 170                  *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
 171                  *             +-+-+-+-+-+
 172                  * C           |n| | | | |
 173                  *             +-+-+-+-+-+
 174                  *
 175                  * (Idea copied from brw_fs_reg_allocate.cpp)
 176                  */
 177                 for (unsigned j = 0; j < class_count; j++)
 178                         q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
 179         }
 180
 181         for (unsigned i = class_count; i < total_class_count; i++) {
 182                 q_values[i] = ralloc_array(q_values, unsigned, total_class_count);
 183
 184                 /* see comment above: */
 185                 for (unsigned j = class_count; j < total_class_count; j++) {
 186                         q_values[i][j] = half_class_sizes[i - class_count] +
 187                                         half_class_sizes[j - class_count] - 1;
 188                 }
 189         }
 190
 191         /* allocate the reg-set.. */
 192         set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
 193         set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
 194         set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
 195
 196         /* .. and classes */
 197         reg = 0;
 198         for (unsigned i = 0; i < class_count; i++) {
 199                 set->classes[i] = ra_alloc_reg_class(set->regs);
 200
 201                 set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
 202
 203                 for (unsigned j = 0; j < CLASS_REGS(i); j++) {
 204                         ra_class_add_reg(set->regs, set->classes[i], reg);
 205
 206                         set->ra_reg_to_gpr[reg] = j;
 207                         set->gpr_to_ra_reg[i][j] = reg;
 208
 209                         for (unsigned br = j; br < j + class_sizes[i]; br++)
 210                                 ra_add_transitive_reg_conflict(set->regs, br, reg);
 211
 212                         reg++;
 213                 }
 214         }
 215
 216         first_half_reg = reg;
 217
 218         for (unsigned i = 0; i < half_class_count; i++) {
 219                 set->half_classes[i] = ra_alloc_reg_class(set->regs);
 220
 221                 set->gpr_to_ra_reg[class_count + i] =
 222                                 ralloc_array(set, uint16_t, CLASS_REGS(i));
 223
 224                 for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
 225                         ra_class_add_reg(set->regs, set->half_classes[i], reg);
 226
 227                         set->ra_reg_to_gpr[reg] = j;
 228                         set->gpr_to_ra_reg[class_count + i][j] = reg;
 229
 230                         for (unsigned br = j; br < j + half_class_sizes[i]; br++)
 231                                 ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
 232
 233                         reg++;
 234                 }
 235         }
 236
 237         ra_set_finalize(set->regs, q_values);
 238
 239         ralloc_free(q_values);
 240
 241         return set;
 242 }
 243
 244 /* additional block-data (per-block) */
 245 struct ir3_ra_block_data {
 246         BITSET_WORD *def;        /* variables defined before used in block */
 247         BITSET_WORD *use;        /* variables used before defined in block */
 248         BITSET_WORD *livein;     /* which defs reach entry point of block */
 249         BITSET_WORD *liveout;    /* which defs reach exit point of block */
 250 };
 251
 252 /* additional instruction-data (per-instruction) */
 253 struct ir3_ra_instr_data {
 254         /* cached instruction 'definer' info: */
 255         struct ir3_instruction *defn;
 256         int off, sz, cls;
 257 };
 258
 259 /* register-assign context, per-shader */
 260 struct ir3_ra_ctx {
 261         struct ir3 *ir;
 262         enum shader_t type;
 263         bool frag_face;
 264
 265         struct ir3_ra_reg_set *set;
 266         struct ra_graph *g;
 267         unsigned alloc_count;
 268         unsigned class_alloc_count[total_class_count];
 269         unsigned class_base[total_class_count];
 270         unsigned instr_cnt;
 271         unsigned *def, *use;     /* def/use table */
 272         struct ir3_ra_instr_data *instrd;
 273 };
 274
 275 static bool
 276 is_half(struct ir3_instruction *instr)
 277 {
 278         return !!(instr->regs[0]->flags & IR3_REG_HALF);
 279 }
 280
 281 static int
 282 size_to_class(unsigned sz, bool half)
 283 {
 284         if (half) {
 285                 for (unsigned i = 0; i < half_class_count; i++)
 286                         if (half_class_sizes[i] >= sz)
 287                                 return i + class_count;
 288         } else {
 289                 for (unsigned i = 0; i < class_count; i++)
 290                         if (class_sizes[i] >= sz)
 291                                 return i;
 292         }
 293         debug_assert(0);
 294         return -1;
 295 }
 296
 297 static bool
 298 is_temp(struct ir3_register *reg)
 299 {
 300         if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 301                 return false;
 302         if ((reg->num == regid(REG_A0, 0)) ||
 303                         (reg->num == regid(REG_P0, 0)))
 304                 return false;
 305         return true;
 306 }
 307
 308 static bool
 309 writes_gpr(struct ir3_instruction *instr)
 310 {
 311         if (is_store(instr))
 312                 return false;
 313         /* is dest a normal temp register: */
 314         return is_temp(instr->regs[0]);
 315 }
 316
/* Find the 'definer' of instr: the instruction which stands in for the
 * whole group of registers that instr's result belongs to, from a
 * use/def perspective.
 *
 * On return, *sz holds the size of the group and *off the offset of
 * instr within it.  The result is cached in ctx->instrd[instr->ip], so
 * later calls for the same instruction return the cached values.
 *
 * Note that an instruction with a fanin is resolved entirely through
 * that fanin, and nothing is cached under the instruction's own ip on
 * that path.
 */
static struct ir3_instruction *
get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
                int *sz, int *off)
{
        struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
        struct ir3_instruction *d = NULL;

        /* part of a fanin (collect): the fanin decides the definer */
        if (instr->fanin)
                return get_definer(ctx, instr->fanin, sz, off);

        /* already computed, use the cached result: */
        if (id->defn) {
                *sz = id->sz;
                *off = id->off;
                return id->defn;
        }

        if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
                /* What about the case where collect is subset of array, we
                 * need to find the distance between where actual array starts
                 * and fanin..  that probably doesn't happen currently.
                 */
                struct ir3_register *src;
                int dsz, doff;

                /* note: don't use foreach_ssa_src as this gets called once
                 * while assigning regs (which clears SSA flag)
                 */
                foreach_src_n(src, n, instr) {
                        struct ir3_instruction *dd;
                        if (!src->instr)
                                continue;

                        dd = get_definer(ctx, src->instr, &dsz, &doff);

                        /* keep the definer with the lowest ip; its offset is
                         * adjusted by the position (n) of the src it came from
                         */
                        if ((!d) || (dd->ip < d->ip)) {
                                d = dd;
                                *sz = dsz;
                                *off = doff - n;
                        }
                }

        } else if (instr->cp.right || instr->cp.left) {
                /* covers also the meta:fo case, which ends up w/ single
                 * scalar instructions for each component:
                 */
                struct ir3_instruction *f = ir3_neighbor_first(instr);

                /* by definition, the entire sequence forms one linked list
                 * of single scalar register nodes (even if some of them may
                 * be fanouts from a texture sample (for example) instr.  We
                 * just need to walk the list finding the first element of
                 * the group defined (lowest ip)
                 */
                int cnt = 0;

                d = f;
                while (f) {
                        if (f->ip < d->ip)
                                d = f;
                        /* instr's position in the list is its offset: */
                        if (f == instr)
                                *off = cnt;
                        f = f->cp.right;
                        cnt++;
                }

                /* the length of the neighbor list is the group size: */
                *sz = cnt;

        } else {
                /* second case is looking directly at the instruction which
                 * produces multiple values (eg, texture sample), rather
                 * than the fanout nodes that point back to that instruction.
                 * This isn't quite right, because it may be part of a larger
                 * group, such as:
                 *
                 *     sam (f32)(xyzw)r0.x, ...
                 *     add r1.x, ...
                 *     add r1.y, ...
                 *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
                 *
                 * need to come up with a better way to handle that case.
                 */
                if (instr->address) {
                        *sz = instr->regs[0]->size;
                } else {
                        /* size is the position of the highest bit set in wrmask */
                        *sz = util_last_bit(instr->regs[0]->wrmask);
                }
                *off = 0;
                d = instr;
        }

        /* definer feeds a phi: take the phi's definer into account too,
         * using the larger of the two sizes and the phi's offset
         */
        if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
                struct ir3_instruction *phi = d->regs[0]->instr;
                struct ir3_instruction *dd;
                int dsz, doff;

                dd = get_definer(ctx, phi, &dsz, &doff);

                *sz = MAX2(*sz, dsz);
                *off = doff;

                if (dd->ip < d->ip) {
                        d = dd;
                }
        }

        if (is_meta(d) && (d->opc == OPC_META_PHI)) {
                /* we have already inserted parallel-copies into
                 * the phi, so we don't need to chase definers
                 */
                struct ir3_register *src;
                struct ir3_instruction *dd = d;

                /* note: don't use foreach_ssa_src as this gets called once
                 * while assigning regs (which clears SSA flag)
                 */
                foreach_src(src, d) {
                        if (!src->instr)
                                continue;
                        /* pick the earliest (lowest ip) of the phi and its srcs */
                        if (src->instr->ip < dd->ip)
                                dd = src->instr;
                }

                d = dd;
        }

        /* definer is a fanout: chase through to the definer of the
         * instruction that the fanout splits (regs[1])
         */
        if (is_meta(d) && (d->opc == OPC_META_FO)) {
                struct ir3_instruction *dd;
                int dsz, doff;

                dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);

                /* by definition, should come before: */
                debug_assert(dd->ip < d->ip);

                *sz = MAX2(*sz, dsz);

                /* Fanout's are grouped, so *off should already valid */

                d = dd;
        }

        /* cache the result for subsequent lookups: */
        id->defn = d;
        id->sz = *sz;
        id->off = *off;

        return d;
}
 464
 465 static void
 466 ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 467 {
 468         list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
 469                 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 470                 if (instr->regs_count == 0)
 471                         continue;
 472                 /* couple special cases: */
 473                 if (writes_addr(instr) || writes_pred(instr)) {
 474                         id->cls = -1;
 475                         continue;
 476                 }
 477                 id->defn = get_definer(ctx, instr, &id->sz, &id->off);
 478                 id->cls = size_to_class(id->sz, is_half(id->defn));
 479         }
 480 }
 481
 482 /* give each instruction a name (and ip), and count up the # of names
 483  * of each class
 484  */
 485 static void
 486 ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 487 {
 488         list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
 489                 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 490
 491 #ifdef DEBUG
 492                 instr->name = ~0;
 493 #endif
 494
 495                 ctx->instr_cnt++;
 496
 497                 if (instr->regs_count == 0)
 498                         continue;
 499
 500                 if (!writes_gpr(instr))
 501                         continue;
 502
 503                 if (id->defn != instr)
 504                         continue;
 505
 506                 /* arrays which don't fit in one of the pre-defined class
 507                  * sizes are pre-colored:
 508                  *
 509                  * TODO but we still need to allocate names for them, don't we??
 510                  */
 511                 if (id->cls >= 0) {
 512                         instr->name = ctx->class_alloc_count[id->cls]++;
 513                         ctx->alloc_count++;
 514                 }
 515         }
 516 }
 517
 518 static void
 519 ra_init(struct ir3_ra_ctx *ctx)
 520 {
 521         unsigned n;
 522
 523         ir3_clear_mark(ctx->ir);
 524         n = ir3_count_instructions(ctx->ir);
 525
 526         ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
 527
 528         list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
 529                 ra_block_find_definers(ctx, block);
 530         }
 531
 532         list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
 533                 ra_block_name_instructions(ctx, block);
 534         }
 535
 536         /* figure out the base register name for each class.  The
 537          * actual ra name is class_base[cls] + instr->name;
 538          */
 539         ctx->class_base[0] = 0;
 540         for (unsigned i = 1; i < total_class_count; i++) {
 541                 ctx->class_base[i] = ctx->class_base[i-1] +
 542                                 ctx->class_alloc_count[i-1];
 543         }
 544
 545         ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
 546         ralloc_steal(ctx->g, ctx->instrd);
 547         ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
 548         ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
 549 }
 550
 551 static unsigned
 552 ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
 553 {
 554         unsigned name;
 555         debug_assert(cls >= 0);
 556         name = ctx->class_base[cls] + defn->name;
 557         debug_assert(name < ctx->alloc_count);
 558         return name;
 559 }
 560
 561 static void
 562 ra_destroy(struct ir3_ra_ctx *ctx)
 563 {
 564         ralloc_free(ctx->g);
 565 }
 566
/* Allocate the per-block data for 'block' (stored in block->bd) and walk
 * its instructions, recording for each ra name:
 *   - ctx->def[] / ctx->use[]: the ip of the def and of the last use seen,
 *   - bd->def / bd->use: whether the name is defined-before-used or
 *     used-before-defined within this block,
 * and setting the node's register class in the interference graph.
 */
static void
ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
        struct ir3_ra_block_data *bd;
        unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);

        /* parented to the graph, so it is freed along with it: */
        bd = rzalloc(ctx->g, struct ir3_ra_block_data);

        bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
        bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
        bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
        bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);

        block->bd = bd;

        list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
                struct ir3_instruction *src;

                if (instr->regs_count == 0)
                        continue;

                /* There are a couple special cases to deal with here:
                 *
                 * fanout: used to split values from a higher class to a lower
                 *     class, for example split the results of a texture fetch
                 *     into individual scalar values;  We skip over these from
                 *     a 'def' perspective, and for a 'use' we walk the chain
                 *     up to the defining instruction.
                 *
                 * fanin: used to collect values from lower class and assemble
                 *     them together into a higher class, for example arguments
                 *     to texture sample instructions;  We consider these to be
                 *     defined at the earliest fanin source.
                 *
                 * phi: used to merge values from different flow control paths
                 *     to the same reg.  Consider defined at earliest phi src,
                 *     and update all the other phi src's (which may come later
                 *     in the program) as users to extend the var's live range.
                 *
                 * Most of this, other than phi, is completely handled in the
                 * get_definer() helper.
                 *
                 * In either case, we trace the instruction back to the original
                 * definer and consider that as the def/use ip.
                 */

                /* 'def' side: only the definer itself counts as a def */
                if (writes_gpr(instr)) {
                        struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

                        if (id->defn == instr) {
                                /* arrays which don't fit in one of the pre-defined class
                                 * sizes are pre-colored:
                                 */
                                if (id->cls >= 0) {
                                        unsigned name = ra_name(ctx, id->cls, id->defn);

                                        /* live range starts out as just the def itself: */
                                        ctx->def[name] = id->defn->ip;
                                        ctx->use[name] = id->defn->ip;

                                        /* since we are in SSA at this point: */
                                        debug_assert(!BITSET_TEST(bd->use, name));

                                        BITSET_SET(bd->def, name);

                                        /* half classes are numbered after the full
                                         * classes, hence the class_count offset:
                                         */
                                        if (is_half(id->defn)) {
                                                ra_set_node_class(ctx->g, name,
                                                                ctx->set->half_classes[id->cls - class_count]);
                                        } else {
                                                ra_set_node_class(ctx->g, name,
                                                                ctx->set->classes[id->cls]);
                                        }

                                        /* extend the live range for phi srcs, which may come
                                         * from the bottom of the loop
                                         */
                                        if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
                                                struct ir3_instruction *phi = id->defn->regs[0]->instr;
                                                foreach_ssa_src(src, phi) {
                                                        /* if src is after phi, then we need to extend
                                                         * the liverange to the end of src's block:
                                                         */
                                                        if (src->ip > phi->ip) {
                                                                struct ir3_instruction *last =
                                                                        list_last_entry(&src->block->instr_list,
                                                                                struct ir3_instruction, node);
                                                                ctx->use[name] = MAX2(ctx->use[name], last->ip);
                                                        }
                                                }
                                        }
                                }
                        }
                }

                /* 'use' side: each src extends the live range of its
                 * definer's name up to this instruction
                 */
                foreach_ssa_src(src, instr) {
                        if (writes_gpr(src)) {
                                struct ir3_ra_instr_data *id = &ctx->instrd[src->ip];

                                if (id->cls >= 0) {
                                        unsigned name = ra_name(ctx, id->cls, id->defn);
                                        ctx->use[name] = MAX2(ctx->use[name], instr->ip);
                                        /* only a block-level 'use' if not already
                                         * defined earlier in this block:
                                         */
                                        if (!BITSET_TEST(bd->def, name))
                                                BITSET_SET(bd->use, name);
                                }
                        }
                }
        }
}
 674
 675 static bool
 676 ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
 677 {
 678         unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
 679         bool progress = false;
 680
 681         list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
 682                 struct ir3_ra_block_data *bd = block->bd;
 683
 684                 /* update livein: */
 685                 for (unsigned i = 0; i < bitset_words; i++) {
 686                         BITSET_WORD new_livein =
 687                                 (bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
 688
 689                         if (new_livein & ~bd->livein[i]) {
 690                                 bd->livein[i] |= new_livein;
 691                                 progress = true;
 692                         }
 693                 }
 694
 695                 /* update liveout: */
 696                 for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
 697                         struct ir3_block *succ = block->successors[j];
 698                         struct ir3_ra_block_data *succ_bd;
 699
 700                         if (!succ)
 701                                 continue;
 702
 703                         succ_bd = succ->bd;
 704
 705                         for (unsigned i = 0; i < bitset_words; i++) {
 706                                 BITSET_WORD new_liveout =
 707                                         (succ_bd->livein[i] & ~bd->liveout[i]);
 708
 709                                 if (new_liveout) {
 710                                         bd->liveout[i] |= new_liveout;
 711                                         progress = true;
 712                                 }
 713                         }
 714                 }
 715         }
 716
 717         return progress;
 718 }
 719
 720 static void
 721 ra_add_interference(struct ir3_ra_ctx *ctx)
 722 {
 723         struct ir3 *ir = ctx->ir;
 724
 725         /* compute live ranges (use/def) on a block level, also updating
 726          * block's def/use bitmasks (used below to calculate per-block
 727          * livein/liveout):
 728          */
 729         list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 730                 ra_block_compute_live_ranges(ctx, block);
 731         }
 732
 733         /* update per-block livein/liveout: */
 734         while (ra_compute_livein_liveout(ctx)) {}
 735
 736         /* extend start/end ranges based on livein/liveout info from cfg: */
 737         unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
 738         list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 739                 struct ir3_ra_block_data *bd = block->bd;
 740
 741                 for (unsigned i = 0; i < bitset_words; i++) {
 742                         if (BITSET_TEST(bd->livein, i)) {
 743                                 ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
 744                                 ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
 745                         }
 746
 747                         if (BITSET_TEST(bd->liveout, i)) {
 748                                 ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
 749                                 ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
 750                         }
 751                 }
 752         }
 753
 754         /* need to fix things up to keep outputs live: */
 755         for (unsigned i = 0; i < ir->noutputs; i++) {
 756                 struct ir3_instruction *instr = ir->outputs[i];
 757                 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 758
 759                 if (id->cls >= 0) {
 760                         unsigned name = ra_name(ctx, id->cls, id->defn);
 761                         ctx->use[name] = ctx->instr_cnt;
 762                 }
 763         }
 764
 765         for (unsigned i = 0; i < ctx->alloc_count; i++) {
 766                 for (unsigned j = 0; j < ctx->alloc_count; j++) {
 767                         if (!((ctx->def[i] >= ctx->use[j]) ||
 768                                         (ctx->def[j] >= ctx->use[i]))) {
 769                                 ra_add_node_interference(ctx->g, i, j);
 770                         }
 771                 }
 772         }
 773 }
 774
 775 /* some instructions need fix-up if dst register is half precision: */
 776 static void fixup_half_instr_dst(struct ir3_instruction *instr)
 777 {
 778         switch (instr->category) {
 779         case 1: /* move instructions */
 780                 instr->cat1.dst_type = half_type(instr->cat1.dst_type);
 781                 break;
 782         case 3:
 783                 switch (instr->opc) {
 784                 case OPC_MAD_F32:
 785                         instr->opc = OPC_MAD_F16;
 786                         break;
 787                 case OPC_SEL_B32:
 788                         instr->opc = OPC_SEL_B16;
 789                         break;
 790                 case OPC_SEL_S32:
 791                         instr->opc = OPC_SEL_S16;
 792                         break;
 793                 case OPC_SEL_F32:
 794                         instr->opc = OPC_SEL_F16;
 795                         break;
 796                 case OPC_SAD_S32:
 797                         instr->opc = OPC_SAD_S16;
 798                         break;
 799                 /* instructions may already be fixed up: */
 800                 case OPC_MAD_F16:
 801                 case OPC_SEL_B16:
 802                 case OPC_SEL_S16:
 803                 case OPC_SEL_F16:
 804                 case OPC_SAD_S16:
 805                         break;
 806                 default:
 807                         assert(0);
 808                         break;
 809                 }
 810                 break;
 811         case 5:
 812                 instr->cat5.type = half_type(instr->cat5.type);
 813                 break;
 814         }
 815 }
 816 /* some instructions need fix-up if src register is half precision: */
 817 static void fixup_half_instr_src(struct ir3_instruction *instr)
 818 {
 819         switch (instr->category) {
 820         case 1: /* move instructions */
 821                 instr->cat1.src_type = half_type(instr->cat1.src_type);
 822                 break;
 823         }
 824 }
 825
 826 static void
 827 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
 828                 struct ir3_instruction *instr)
 829 {
 830         struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 831
 832         if (id->cls >= 0) {
 833                 unsigned name = ra_name(ctx, id->cls, id->defn);
 834                 unsigned r = ra_get_node_reg(ctx->g, name);
 835                 unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
 836
 837                 if (reg->flags & IR3_REG_RELATIV)
 838                         num += reg->offset;
 839
 840                 reg->num = num;
 841                 reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
 842
 843                 if (is_half(id->defn))
 844                         reg->flags |= IR3_REG_HALF;
 845         }
 846 }
 847
 848 static void
 849 ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 850 {
 851         list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
 852                 struct ir3_register *reg;
 853
 854                 if (instr->regs_count == 0)
 855                         continue;
 856
 857                 if (writes_gpr(instr)) {
 858                         reg_assign(ctx, instr->regs[0], instr);
 859                         if (instr->regs[0]->flags & IR3_REG_HALF)
 860                                 fixup_half_instr_dst(instr);
 861                 }
 862
 863                 foreach_src_n(reg, n, instr) {
 864                         struct ir3_instruction *src = reg->instr;
 865                         if (!src)
 866                                 continue;
 867
 868                         reg_assign(ctx, instr->regs[n+1], src);
 869                         if (instr->regs[n+1]->flags & IR3_REG_HALF)
 870                                 fixup_half_instr_src(instr);
 871                 }
 872         }
 873 }
 874
 875 static int
 876 ra_alloc(struct ir3_ra_ctx *ctx)
 877 {
 878         /* frag shader inputs get pre-assigned, since we have some
 879          * constraints/unknowns about setup for some of these regs:
 880          */
 881         if (ctx->type == SHADER_FRAGMENT) {
 882                 struct ir3 *ir = ctx->ir;
 883                 unsigned i = 0, j;
 884                 if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
 885                         struct ir3_instruction *instr = ir->inputs[i];
 886                         int cls = size_to_class(1, true);
 887                         unsigned name = ra_name(ctx, cls, instr);
 888                         unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
 889
 890                         /* if we have frag_face, it gets hr0.x */
 891                         ra_set_node_reg(ctx->g, name, reg);
 892                         i += 4;
 893                 }
 894
 895                 for (j = 0; i < ir->ninputs; i++) {
 896                         struct ir3_instruction *instr = ir->inputs[i];
 897                         if (instr) {
 898                                 struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
 899
 900                                 if (id->defn == instr) {
 901                                         unsigned name, reg;
 902
 903                                         name = ra_name(ctx, id->cls, id->defn);
 904                                         reg = ctx->set->gpr_to_ra_reg[id->cls][j];
 905
 906                                         ra_set_node_reg(ctx->g, name, reg);
 907                                         j += id->sz;
 908                                 }
 909                         }
 910                 }
 911         }
 912
 913         if (!ra_allocate(ctx->g))
 914                 return -1;
 915
 916         list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
 917                 ra_block_alloc(ctx, block);
 918         }
 919
 920         return 0;
 921 }
 922
 923 int ir3_ra(struct ir3 *ir, enum shader_t type,
 924                 bool frag_coord, bool frag_face)
 925 {
 926         struct ir3_ra_ctx ctx = {
 927                         .ir = ir,
 928                         .type = type,
 929                         .frag_face = frag_face,
 930                         .set = ir->compiler->set,
 931         };
 932         int ret;
 933
 934         ra_init(&ctx);
 935         ra_add_interference(&ctx);
 936         ret = ra_alloc(&ctx);
 937         ra_destroy(&ctx);
 938
 939         return ret;
 940 }