/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/u_math.h"
#include "util/register_allocate.h"
#include "util/ralloc.h"
#include "util/bitset.h"

#include "ir3.h"
#include "ir3_compiler.h"

/*
 * Register Assignment:
 *
 * Uses the register_allocate util, which implements graph coloring
 * algo with interference classes.  To handle the cases where we need
 * consecutive registers (for example, texture sample instructions),
 * we model these as larger (double/quad/etc) registers which conflict
 * with the corresponding registers in other classes.
 *
 * Additionally we create classes for half-regs, which do not conflict
 * with the full-reg classes.  We do need at least sizes 1-4 (to deal
 * w/ texture sample instructions that output to half-reg).  At the
 * moment we don't create the higher order half-reg classes, as half-reg
 * frequently does not have enough precision for texture coords at
 * higher resolutions.
 *
 * There are some additional cases that we need to handle specially,
 * as the graph coloring algo doesn't understand "partial writes".
 * For example, a sequence like:
 *
 *   add r0.z, ...
 *   sam (f32)(xy)r0.x, ...
 *   ...
 *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
 *
 * In this scenario, we treat r0.xyz as class size 3, which is written
 * (from a use/def perspective) at the 'add' instruction and ignore the
 * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
 * defining instruction, as it is the first to partially write r0.xyz.
 *
 * Note i965 has a similar scenario, which they solve with a virtual
 * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
 * register assignment.  But for us that is horrible from a scheduling
 * standpoint.  Instead what we do is use the idea of a 'definer'
 * instruction.  Ie. the first instruction (lowest ip) to write to the
 * array is the one we consider from a use/def perspective when building
 * the interference graph.  (Other instructions which write other array
 * elements just define the variable some more.)
 */

static const unsigned class_sizes[] = {
	1, 2, 3, 4,
	4 + 4,   /* txd + 1d/2d */
	4 + 6,   /* txd + 3d */
	/* temporary: until we can assign arrays, create classes so we
	 * can round up array to fit.  NOTE with tgsi, arrays should
	 * really all be multiples of four:
	 */
	4 * 4,
	4 * 8,
	4 * 16,
	4 * 32,

};
#define class_count ARRAY_SIZE(class_sizes)

static const unsigned half_class_sizes[] = {
	1, 2, 3, 4,
};
#define half_class_count  ARRAY_SIZE(half_class_sizes)
#define total_class_count (class_count + half_class_count)

/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
#define NUM_REGS           (4 * (REG_A0 - 1))
/* Number of virtual regs in a given class: */
#define CLASS_REGS(i)      (NUM_REGS - (class_sizes[i] - 1))
#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1))
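/* For example, with class_sizes[1] == 2, CLASS_REGS(1) == NUM_REGS - 1:
 * a two-wide register starting at the very last scalar gpr would run off
 * the end of the register file, so it has one fewer possible placement.
 */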

/* register-set, created one time, used for all shaders: */
struct ir3_ra_reg_set {
	struct ra_regs *regs;
	unsigned int classes[class_count];
	unsigned int half_classes[half_class_count];
	/* maps flat virtual register space to base gpr: */
	uint16_t *ra_reg_to_gpr;
	/* maps cls,gpr to flat virtual register space: */
	uint16_t **gpr_to_ra_reg;
};

/* One-time setup of RA register-set, which describes all the possible
 * "virtual" registers and their interferences.  Ie. double register
 * occupies (and conflicts with) two single registers, and so forth.
 * Since registers do not need to be aligned to their class size, they
 * can conflict with other registers in the same class too.  Ie:
 *
 *    Single (base) |  Double
 *    --------------+---------------
 *       R0         |  D0
 *       R1         |  D0 D1
 *       R2         |     D1 D2
 *       R3         |        D2
 *           .. and so on..
 *
 * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
 * really just four scalar registers.  Don't let that confuse you.)
 */
struct ir3_ra_reg_set *
ir3_ra_alloc_reg_set(void *memctx)
{
	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
	unsigned ra_reg_count, reg, first_half_reg;
	unsigned int **q_values;

	/* calculate # of regs across all classes: */
	ra_reg_count = 0;
	for (unsigned i = 0; i < class_count; i++)
		ra_reg_count += CLASS_REGS(i);
	for (unsigned i = 0; i < half_class_count; i++)
		ra_reg_count += HALF_CLASS_REGS(i);

	/* allocate and populate q_values: */
	q_values = ralloc_array(set, unsigned *, total_class_count);
	for (unsigned i = 0; i < class_count; i++) {
		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);

		/* From register_allocate.c:
		 *
		 * q(B,C) (indexed by C, B is this register class) in
		 * Runeson/Nyström paper.  This is "how many registers of B could
		 * the worst choice register from C conflict with".
		 *
		 * If we just let the register allocation algorithm compute these
		 * values, it is extremely expensive.  However, since all of our
		 * registers are laid out, we can very easily compute them
		 * ourselves.  View the register from C as fixed starting at GRF n
		 * somewhere in the middle, and the register from B as sliding back
		 * and forth.  Then the first register to conflict from B is the
		 * one starting at n - class_size[B] + 1 and the last register to
		 * conflict will start at n + class_size[B] - 1.  Therefore, the
		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
		 *
		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
		 * B | | | | | |n| --> | | | | | | |
		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
		 *             +-+-+-+-+-+
		 * C           |n| | | | |
		 *             +-+-+-+-+-+
		 *
		 * (Idea copied from brw_fs_reg_allocate.cpp)
		 */
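		/* Illustrative example: with class_sizes[B] == 2 and
		 * class_sizes[C] == 4, the two-wide B registers starting
		 * anywhere from n - 1 through n + 3 overlap the four-wide C
		 * register at n, giving 2 + 4 - 1 == 5 conflicts.
		 */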
		for (unsigned j = 0; j < class_count; j++)
			q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
	}

	for (unsigned i = class_count; i < total_class_count; i++) {
		q_values[i] = ralloc_array(q_values, unsigned, total_class_count);

		/* see comment above: */
		for (unsigned j = class_count; j < total_class_count; j++) {
			q_values[i][j] = half_class_sizes[i - class_count] +
					half_class_sizes[j - class_count] - 1;
		}
	}

	/* allocate the reg-set.. */
	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);

	/* .. and classes */
	reg = 0;
	for (unsigned i = 0; i < class_count; i++) {
		set->classes[i] = ra_alloc_reg_class(set->regs);

		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));

		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
			ra_class_add_reg(set->regs, set->classes[i], reg);

			set->ra_reg_to_gpr[reg] = j;
			set->gpr_to_ra_reg[i][j] = reg;

			for (unsigned br = j; br < j + class_sizes[i]; br++)
				ra_add_transitive_reg_conflict(set->regs, br, reg);

			reg++;
		}
	}

	first_half_reg = reg;

	for (unsigned i = 0; i < half_class_count; i++) {
		set->half_classes[i] = ra_alloc_reg_class(set->regs);

		set->gpr_to_ra_reg[class_count + i] =
				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));

		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
			ra_class_add_reg(set->regs, set->half_classes[i], reg);

			set->ra_reg_to_gpr[reg] = j;
			set->gpr_to_ra_reg[class_count + i][j] = reg;

			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);

			reg++;
		}
	}

	ra_set_finalize(set->regs, q_values);

	ralloc_free(q_values);

	return set;
}

/* additional block-data (per-block) */
struct ir3_ra_block_data {
	BITSET_WORD *def;        /* variables defined before used in block */
	BITSET_WORD *use;        /* variables used before defined in block */
	BITSET_WORD *livein;     /* which defs reach entry point of block */
	BITSET_WORD *liveout;    /* which defs reach exit point of block */
};

/* additional instruction-data (per-instruction) */
struct ir3_ra_instr_data {
	/* cached instruction 'definer' info: */
	struct ir3_instruction *defn;
	int off, sz, cls;
};

/* register-assign context, per-shader */
struct ir3_ra_ctx {
	struct ir3 *ir;
	enum shader_t type;
	bool frag_face;

	struct ir3_ra_reg_set *set;
	struct ra_graph *g;
	unsigned alloc_count;
	unsigned class_alloc_count[total_class_count];
	unsigned class_base[total_class_count];
	unsigned instr_cnt;
	unsigned *def, *use;     /* def/use table */
	struct ir3_ra_instr_data *instrd;
};

static bool
is_half(struct ir3_instruction *instr)
{
	return !!(instr->regs[0]->flags & IR3_REG_HALF);
}

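/* Map a value size to the smallest class that can hold it; e.g. a
 * full-precision value of size 5 rounds up to the 4 + 4 (txd) class.
 */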
static int
size_to_class(unsigned sz, bool half)
{
	if (half) {
		for (unsigned i = 0; i < half_class_count; i++)
			if (half_class_sizes[i] >= sz)
				return i + class_count;
	} else {
		for (unsigned i = 0; i < class_count; i++)
			if (class_sizes[i] >= sz)
				return i;
	}
	debug_assert(0);
	return -1;
}

static bool
is_temp(struct ir3_register *reg)
{
	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
		return false;
	if ((reg->num == regid(REG_A0, 0)) ||
			(reg->num == regid(REG_P0, 0)))
		return false;
	return true;
}

static bool
writes_gpr(struct ir3_instruction *instr)
{
	if (is_store(instr))
		return false;
	/* is dest a normal temp register: */
	return is_temp(instr->regs[0]);
}

static bool
instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
{
	if (a->flags & IR3_INSTR_UNUSED)
		return false;
	return (a->ip < b->ip);
}

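/* Find the 'definer' for an instruction: the first instruction (lowest
 * ip) to write any part of the register group that this instruction's
 * dst belongs to, along with the group size and this instruction's
 * offset within the group.
 */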
static struct ir3_instruction *
get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
		int *sz, int *off)
{
	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
	struct ir3_instruction *d = NULL;

	if (instr->fanin)
		return get_definer(ctx, instr->fanin, sz, off);

	if (id->defn) {
		*sz = id->sz;
		*off = id->off;
		return id->defn;
	}

	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
		/* What about the case where the collect is a subset of the array?
		 * We'd need to find the distance between where the actual array
		 * starts and the fanin.. that probably doesn't happen currently.
		 */
		struct ir3_register *src;
		int dsz, doff;

		/* note: don't use foreach_ssa_src as this gets called once
		 * while assigning regs (which clears SSA flag)
		 */
		foreach_src_n(src, n, instr) {
			struct ir3_instruction *dd;
			if (!src->instr)
				continue;

			dd = get_definer(ctx, src->instr, &dsz, &doff);

			if ((!d) || instr_before(dd, d)) {
				d = dd;
				*sz = dsz;
				*off = doff - n;
			}
		}

	} else if (instr->cp.right || instr->cp.left) {
		/* covers also the meta:fo case, which ends up w/ single
		 * scalar instructions for each component:
		 */
		struct ir3_instruction *f = ir3_neighbor_first(instr);

		/* by definition, the entire sequence forms one linked list
		 * of single scalar register nodes (even if some of them may be
		 * fanouts from, for example, a texture sample instruction).  We
		 * just need to walk the list to find the first element of the
		 * group to be defined (lowest ip).
		 */
		int cnt = 0;

		/* need to skip over unused in the group: */
		while (f && (f->flags & IR3_INSTR_UNUSED)) {
			f = f->cp.right;
			cnt++;
		}

		while (f) {
			if ((!d) || instr_before(f, d))
				d = f;
			if (f == instr)
				*off = cnt;
			f = f->cp.right;
			cnt++;
		}

		*sz = cnt;

	} else {
		/* second case is looking directly at the instruction which
		 * produces multiple values (eg, texture sample), rather
		 * than the fanout nodes that point back to that instruction.
		 * This isn't quite right, because it may be part of a larger
		 * group, such as:
		 *
		 *    sam (f32)(xyzw)r0.x, ...
		 *    add r1.x, ...
		 *    add r1.y, ...
		 *    sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
		 *
		 * need to come up with a better way to handle that case.
		 */
		if (instr->address) {
			*sz = instr->regs[0]->size;
		} else {
			*sz = util_last_bit(instr->regs[0]->wrmask);
		}
		*off = 0;
		d = instr;
	}

	if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
		struct ir3_instruction *phi = d->regs[0]->instr;
		struct ir3_instruction *dd;
		int dsz, doff;

		dd = get_definer(ctx, phi, &dsz, &doff);

		*sz = MAX2(*sz, dsz);
		*off = doff;

		if (instr_before(dd, d)) {
			d = dd;
		}
	}

	if (is_meta(d) && (d->opc == OPC_META_PHI)) {
		/* we have already inserted parallel-copies into
		 * the phi, so we don't need to chase definers
		 */
		struct ir3_register *src;
		struct ir3_instruction *dd = d;

		/* note: don't use foreach_ssa_src as this gets called once
		 * while assigning regs (which clears SSA flag)
		 */
		foreach_src(src, d) {
			if (!src->instr)
				continue;
			if (instr_before(src->instr, dd))
				dd = src->instr;
		}

		d = dd;
	}

	if (is_meta(d) && (d->opc == OPC_META_FO)) {
		struct ir3_instruction *dd;
		int dsz, doff;

		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);

		/* by definition, should come before: */
		debug_assert(instr_before(dd, d));

		*sz = MAX2(*sz, dsz);

		/* Fanouts are grouped, so *off should already be valid */

		d = dd;
	}

	id->defn = d;
	id->sz = *sz;
	id->off = *off;

	return d;
}

static void
ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
		if (instr->regs_count == 0)
			continue;
		/* couple special cases: */
		if (writes_addr(instr) || writes_pred(instr)) {
			id->cls = -1;
			continue;
		}
		id->defn = get_definer(ctx, instr, &id->sz, &id->off);
		id->cls = size_to_class(id->sz, is_half(id->defn));
	}
}

/* give each instruction a name (and ip), and count up the # of names
 * of each class
 */
static void
ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

#ifdef DEBUG
		instr->name = ~0;
#endif

		ctx->instr_cnt++;

		if (instr->regs_count == 0)
			continue;

		if (!writes_gpr(instr))
			continue;

		if (id->defn != instr)
			continue;

		/* arrays which don't fit in one of the pre-defined class
		 * sizes are pre-colored:
		 *
		 * TODO but we still need to allocate names for them, don't we??
		 */
		if (id->cls >= 0) {
			instr->name = ctx->class_alloc_count[id->cls]++;
			ctx->alloc_count++;
		}
	}
}

static void
ra_init(struct ir3_ra_ctx *ctx)
{
	unsigned n;

	ir3_clear_mark(ctx->ir);
	n = ir3_count_instructions(ctx->ir);

	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_find_definers(ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_name_instructions(ctx, block);
	}

	/* figure out the base register name for each class.  The
	 * actual ra name is class_base[cls] + instr->name;
	 */
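	/* For example, if class 0 allocated 10 names and class 1 allocated 3,
	 * then class 1's names start at 10 and class 2's names start at 13.
	 */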
	ctx->class_base[0] = 0;
	for (unsigned i = 1; i < total_class_count; i++) {
		ctx->class_base[i] = ctx->class_base[i-1] +
				ctx->class_alloc_count[i-1];
	}

	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
	ralloc_steal(ctx->g, ctx->instrd);
	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
}

static unsigned
ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
{
	unsigned name;
	debug_assert(cls >= 0);
	name = ctx->class_base[cls] + defn->name;
	debug_assert(name < ctx->alloc_count);
	return name;
}

static void
ra_destroy(struct ir3_ra_ctx *ctx)
{
	ralloc_free(ctx->g);
}

static void
ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	struct ir3_ra_block_data *bd;
	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);

	bd = rzalloc(ctx->g, struct ir3_ra_block_data);

	bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);

	block->data = bd;

	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_instruction *src;

		if (instr->regs_count == 0)
			continue;

		/* There are a couple special cases to deal with here:
		 *
		 * fanout: used to split values from a higher class to a lower
		 *     class, for example split the results of a texture fetch
		 *     into individual scalar values;  We skip over these from
		 *     a 'def' perspective, and for a 'use' we walk the chain
		 *     up to the defining instruction.
		 *
		 * fanin: used to collect values from lower class and assemble
		 *     them together into a higher class, for example arguments
		 *     to texture sample instructions;  We consider these to be
		 *     defined at the earliest fanin source.
		 *
		 * phi: used to merge values from different flow control paths
		 *     to the same reg.  Consider defined at earliest phi src,
		 *     and update all the other phi src's (which may come later
		 *     in the program) as users to extend the var's live range.
		 *
		 * Most of this, other than phi, is completely handled in the
		 * get_definer() helper.
		 *
		 * In either case, we trace the instruction back to the original
		 * definer and consider that as the def/use ip.
		 */

		if (writes_gpr(instr)) {
			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

			if (id->defn == instr) {
				/* arrays which don't fit in one of the pre-defined class
				 * sizes are pre-colored:
				 */
				if (id->cls >= 0) {
					unsigned name = ra_name(ctx, id->cls, id->defn);

					ctx->def[name] = id->defn->ip;
					ctx->use[name] = id->defn->ip;

					/* since we are in SSA at this point: */
					debug_assert(!BITSET_TEST(bd->use, name));

					BITSET_SET(bd->def, name);

					if (is_half(id->defn)) {
						ra_set_node_class(ctx->g, name,
								ctx->set->half_classes[id->cls - class_count]);
					} else {
						ra_set_node_class(ctx->g, name,
								ctx->set->classes[id->cls]);
					}

					/* extend the live range for phi srcs, which may come
					 * from the bottom of the loop
					 */
					if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
						struct ir3_instruction *phi = id->defn->regs[0]->instr;
						foreach_ssa_src(src, phi) {
							/* if src is after phi, then we need to extend
							 * the liverange to the end of src's block:
							 */
							if (src->ip > phi->ip) {
								struct ir3_instruction *last =
									list_last_entry(&src->block->instr_list,
										struct ir3_instruction, node);
								ctx->use[name] = MAX2(ctx->use[name], last->ip);
							}
						}
					}
				}
			}
		}

		foreach_ssa_src(src, instr) {
			if (writes_gpr(src)) {
				struct ir3_ra_instr_data *id = &ctx->instrd[src->ip];

				if (id->cls >= 0) {
					unsigned name = ra_name(ctx, id->cls, id->defn);
					ctx->use[name] = MAX2(ctx->use[name], instr->ip);
					if (!BITSET_TEST(bd->def, name))
						BITSET_SET(bd->use, name);
				}
			}
		}
	}
}

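/* Standard iterative backward dataflow:
 *
 *   livein  = use | (liveout & ~def)
 *   liveout = union of livein over all successors
 *
 * repeated (by the caller) until no block's sets change.
 */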
static bool
ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
{
	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
	bool progress = false;

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		struct ir3_ra_block_data *bd = block->data;

		/* update livein: */
		for (unsigned i = 0; i < bitset_words; i++) {
			BITSET_WORD new_livein =
				(bd->use[i] | (bd->liveout[i] & ~bd->def[i]));

			if (new_livein & ~bd->livein[i]) {
				bd->livein[i] |= new_livein;
				progress = true;
			}
		}

		/* update liveout: */
		for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
			struct ir3_block *succ = block->successors[j];
			struct ir3_ra_block_data *succ_bd;

			if (!succ)
				continue;

			succ_bd = succ->data;

			for (unsigned i = 0; i < bitset_words; i++) {
				BITSET_WORD new_liveout =
					(succ_bd->livein[i] & ~bd->liveout[i]);

				if (new_liveout) {
					bd->liveout[i] |= new_liveout;
					progress = true;
				}
			}
		}
	}

	return progress;
}

static void
ra_add_interference(struct ir3_ra_ctx *ctx)
{
	struct ir3 *ir = ctx->ir;

	/* compute live ranges (use/def) on a block level, also updating
	 * block's def/use bitmasks (used below to calculate per-block
	 * livein/liveout):
	 */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ra_block_compute_live_ranges(ctx, block);
	}

	/* update per-block livein/liveout: */
	while (ra_compute_livein_liveout(ctx)) {}

	/* extend start/end ranges based on livein/liveout info from cfg: */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		struct ir3_ra_block_data *bd = block->data;

		for (unsigned i = 0; i < ctx->alloc_count; i++) {
			if (BITSET_TEST(bd->livein, i)) {
				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
			}

			if (BITSET_TEST(bd->liveout, i)) {
				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
			}
		}
	}

	/* need to fix things up to keep outputs live: */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *instr = ir->outputs[i];
		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

		if (id->cls >= 0) {
			unsigned name = ra_name(ctx, id->cls, id->defn);
			ctx->use[name] = ctx->instr_cnt;
		}
	}

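	/* Two names interfere unless one is defined at or after the other's
	 * last use, i.e. their [def, use] ranges overlap:
	 */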
	for (unsigned i = 0; i < ctx->alloc_count; i++) {
		for (unsigned j = 0; j < ctx->alloc_count; j++) {
			if (!((ctx->def[i] >= ctx->use[j]) ||
					(ctx->def[j] >= ctx->use[i]))) {
				ra_add_node_interference(ctx->g, i, j);
			}
		}
	}
}

/* some instructions need fix-up if dst register is half precision: */
static void fixup_half_instr_dst(struct ir3_instruction *instr)
{
	switch (instr->category) {
	case 1: /* move instructions */
		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
		break;
	case 3: /* three-src ALU: switch to the half-precision opcode */
		switch (instr->opc) {
		case OPC_MAD_F32:
			instr->opc = OPC_MAD_F16;
			break;
		case OPC_SEL_B32:
			instr->opc = OPC_SEL_B16;
			break;
		case OPC_SEL_S32:
			instr->opc = OPC_SEL_S16;
			break;
		case OPC_SEL_F32:
			instr->opc = OPC_SEL_F16;
			break;
		case OPC_SAD_S32:
			instr->opc = OPC_SAD_S16;
			break;
		/* instructions may already be fixed up: */
		case OPC_MAD_F16:
		case OPC_SEL_B16:
		case OPC_SEL_S16:
		case OPC_SEL_F16:
		case OPC_SAD_S16:
			break;
		default:
			assert(0);
			break;
		}
		break;
	case 5: /* texture sample instructions */
		instr->cat5.type = half_type(instr->cat5.type);
		break;
	}
}

/* some instructions need fix-up if src register is half precision: */
static void fixup_half_instr_src(struct ir3_instruction *instr)
{
	switch (instr->category) {
	case 1: /* move instructions */
		instr->cat1.src_type = half_type(instr->cat1.src_type);
		break;
	}
}

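/* Translate the RA-assigned virtual register for this instruction's
 * definer back to an actual base gpr, and apply the instruction's
 * offset within the group to get the final register number.
 */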
static void
reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
		struct ir3_instruction *instr)
{
	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

	if (id->cls >= 0) {
		unsigned name = ra_name(ctx, id->cls, id->defn);
		unsigned r = ra_get_node_reg(ctx->g, name);
		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;

		if (reg->flags & IR3_REG_RELATIV)
			num += reg->offset;

		reg->num = num;
		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);

		if (is_half(id->defn))
			reg->flags |= IR3_REG_HALF;
	}
}

static void
ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_register *reg;

		if (instr->regs_count == 0)
			continue;

		if (writes_gpr(instr)) {
			reg_assign(ctx, instr->regs[0], instr);
			if (instr->regs[0]->flags & IR3_REG_HALF)
				fixup_half_instr_dst(instr);
		}

		foreach_src_n(reg, n, instr) {
			struct ir3_instruction *src = reg->instr;
			if (!src)
				continue;

			reg_assign(ctx, instr->regs[n+1], src);
			if (instr->regs[n+1]->flags & IR3_REG_HALF)
				fixup_half_instr_src(instr);
		}
	}
}

static int
ra_alloc(struct ir3_ra_ctx *ctx)
{
	/* frag shader inputs get pre-assigned, since we have some
	 * constraints/unknowns about setup for some of these regs:
	 */
	if (ctx->type == SHADER_FRAGMENT) {
		struct ir3 *ir = ctx->ir;
		unsigned i = 0, j;
		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
			struct ir3_instruction *instr = ir->inputs[i];
			int cls = size_to_class(1, true);
			unsigned name = ra_name(ctx, cls, instr);
			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];

			/* if we have frag_face, it gets hr0.x */
			ra_set_node_reg(ctx->g, name, reg);
			i += 4;
		}

		/* i continues past frag_face (if present); j tracks the base
		 * gpr for the next input:
		 */
		for (j = 0; i < ir->ninputs; i++) {
			struct ir3_instruction *instr = ir->inputs[i];
			if (instr) {
				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

				if (id->defn == instr) {
					unsigned name, reg;

					name = ra_name(ctx, id->cls, id->defn);
					reg = ctx->set->gpr_to_ra_reg[id->cls][j];

					ra_set_node_reg(ctx->g, name, reg);
					j += id->sz;
				}
			}
		}
	}

	if (!ra_allocate(ctx->g))
		return -1;

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_alloc(ctx, block);
	}

	return 0;
}

int ir3_ra(struct ir3 *ir, enum shader_t type,
		bool frag_coord, bool frag_face)
{
	struct ir3_ra_ctx ctx = {
			.ir = ir,
			.type = type,
			.frag_face = frag_face,
			.set = ir->compiler->set,
	};
	int ret;

	ra_init(&ctx);
	ra_add_interference(&ctx);
	ret = ra_alloc(&ctx);
	ra_destroy(&ctx);

	return ret;
}