/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"
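/* negative sentinel return values for trysched() and
 * block_sched_undelayed():
 */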
enum {
	SCHEDULED = -1,
	DELAYED = -2,
};

/*
 * Instruction Scheduling:
 *
 * Using the depth-sorted list from the depth pass, attempt to
 * recursively schedule the deepest unscheduled path.  The first
 * instruction that cannot be scheduled returns the number of delay
 * slots it needs, at which point we return back up to the top and
 * attempt to schedule by the next highest depth.  After a sufficient
 * number of instructions have been scheduled, return back to the
 * beginning of the list and start again.  If we reach the end of the
 * depth-sorted list without being able to insert any instruction,
 * insert nops.  Repeat until there are no more unscheduled
 * instructions.
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Uses of the
 * address register (a0.x) or predicate register (p0.x) must be
 * serialized, ie. if you have two pairs of instructions that write
 * and then read the same special register, those pairs cannot be
 * interleaved.  To solve this, when we are in such a scheduling
 * "critical section", and we encounter a conflicting write to a
 * special register, we try to schedule any remaining instructions
 * that use that value first.
 */

struct ir3_sched_ctx {
	struct ir3_instruction *scheduled; /* last scheduled instr */
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	unsigned cnt;                      /* number of instrs scheduled so far */
	bool error;                        /* set if we hit an unschedulable situation */
};

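/* pull the deepest (by ->depth) not-yet-returned instruction out of
 * the srcs[] array (replacing it with NULL), so that repeated calls
 * iterate the srcs in order of decreasing depth:
 */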
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

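/* count how many alu/flow instructions have been scheduled since
 * 'instr' (capped at 'maxd'), used to credit already elapsed cycles
 * against the required delay slots:
 */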
static unsigned distance(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr, unsigned maxd)
{
	struct ir3_instruction *n = ctx->scheduled;
	unsigned d = 0;
	while (n && (n != instr) && (d < maxd)) {
		if (is_alu(n) || is_flow(n))
			d++;
		n = n->next;
	}
	return d;
}

/* TODO maybe we want a doubly linked list? */
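/* walk the depth list from block->head to find the instruction
 * preceding 'instr':
 */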
static struct ir3_instruction * prev(struct ir3_instruction *instr)
{
	struct ir3_instruction *p = instr->block->head;
	while (p && (p->next != instr))
		p = p->next;
	return p;
}

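/* sfu and mem instructions cannot be scheduled back to back; see the
 * nop insertion in schedule():
 */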
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

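/* push 'instr' onto the scheduled list (which is built in reverse
 * order, see reverse()), optionally unlinking it from the depth list
 * first:
 */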
static void schedule(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr, bool remove)
{
	struct ir3_block *block = instr->block;

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		schedule(ctx, ir3_NOP(block), false);

	/* remove from depth list:
	 */
	if (remove) {
		struct ir3_instruction *p = prev(instr);

		/* NOTE: this can happen for inputs which are not
		 * read.. in that case there is no need to schedule
		 * the input, so just bail:
		 */
		if (instr != (p ? p->next : block->head))
			return;

		if (p)
			p->next = instr->next;
		else
			block->head = instr->next;
	}

	if (writes_addr(instr)) {
		assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	instr->next = ctx->scheduled;
	ctx->scheduled = instr;

	ctx->cnt++;
}

/*
 * Delay-slot calculation.  Follows fanin/fanout.
 */

/* calculate delay for specified src: */
static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

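	/* meta instructions don't generate a value themselves, so
	 * recurse to find the real assigner(s):
	 */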
	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned delay_calc(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

/* A negative return value signals that an instruction has been newly
 * SCHEDULED (or DELAYED due to the address or predicate register
 * already being in use), in which case we return back up to the top
 * of the stack (to block_sched()).  A positive return value is the
 * number of delay slots still needed before the instruction can be
 * scheduled.
 */
static int trysched(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[64];
	struct ir3_instruction *src;
	unsigned delay, nsrcs = 0;

	/* if already scheduled: */
	if (instr->flags & IR3_INSTR_MARK)
		return 0;

	/* figure out our srcs, copy 'em out into an array for sorting: */
	foreach_ssa_src(src, instr) {
		debug_assert(nsrcs < ARRAY_SIZE(srcs));
		srcs[nsrcs++] = src;
	}

	/* try to schedule each src, deepest first:
	 */
	delay = 0;
	while ((src = deepest(srcs, nsrcs))) {
		delay = trysched(ctx, src);
		if (delay)
			return delay;
	}

	/* all the instructions we depend on are scheduled, figure out if
	 * we have enough delay slots to schedule ourself:
	 */
	delay = delay_calc(ctx, instr);
	if (delay)
		return delay;

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * a virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;
		unsigned i;

		for (i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->depth == DEPTH_UNUSED)
				continue;
			delay = trysched(ctx, baryf);
			if (delay)
				return delay;
		}
	}

	/* if this is a write to the address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		assert(ctx->addr != instr);
		return DELAYED;
	}
	if (writes_pred(instr) && ctx->pred) {
		assert(ctx->pred != instr);
		return DELAYED;
	}

	schedule(ctx, instr, true);
	return SCHEDULED;
}

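/* the scheduled list is built in reverse order, so reverse the
 * (singly linked) list in place to restore instruction order:
 */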
static struct ir3_instruction * reverse(struct ir3_instruction *instr)
{
	struct ir3_instruction *reversed = NULL;
	while (instr) {
		struct ir3_instruction *next = instr->next;
		instr->next = reversed;
		reversed = instr;
		instr = next;
	}
	return reversed;
}

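/* check whether 'instr' reads the a0.x value written by the currently
 * live address-register writer (ctx->addr):
 */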
static bool uses_current_addr(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	return instr->address && (ctx->addr == instr->address);
}

static bool uses_current_pred(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	foreach_ssa_src(src, instr)
		if (ctx->pred == src)
			return true;
	return false;
}

/* when we encounter an instruction that writes to the address (or
 * predicate) register while it is still in use, we delay that
 * instruction and try to schedule all the other instructions using
 * the current register value first:
 */
static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
		struct ir3_block *block)
{
	struct ir3_instruction *instr = block->head;
	bool addr_in_use = false;
	bool pred_in_use = false;
	bool all_delayed = true;
	unsigned cnt = ~0, attempted = 0;

	while (instr) {
		struct ir3_instruction *next = instr->next;
		bool addr = uses_current_addr(ctx, instr);
		bool pred = uses_current_pred(ctx, instr);

		if (addr || pred) {
			int ret = trysched(ctx, instr);

			if (ret != DELAYED)
				all_delayed = false;

			if (ret == SCHEDULED)
				cnt = 0;
			else if (ret > 0)
				cnt = MIN2(cnt, ret);
			if (addr)
				addr_in_use = true;
			if (pred)
				pred_in_use = true;

			attempted++;
		}

		instr = next;
	}

	if (!addr_in_use)
		ctx->addr = NULL;

	if (!pred_in_use)
		ctx->pred = NULL;

	/* detect if we've gotten ourselves into an impossible situation
	 * and bail if needed:
	 */
	if (all_delayed && (attempted > 0)) {
		if (pred_in_use) {
			/* TODO we probably need to keep a list of instructions
			 * that reference the predicate, similar to indirects
			 */
			ctx->error = true;
			return DELAYED;
		}
		if (addr_in_use) {
			struct ir3 *ir = ctx->addr->block->shader;
			struct ir3_instruction *new_addr =
					ir3_instr_clone(ctx->addr);
			unsigned i;

			/* original addr is scheduled, but new one isn't: */
			new_addr->flags &= ~IR3_INSTR_MARK;

			for (i = 0; i < ir->indirects_count; i++) {
				struct ir3_instruction *indirect = ir->indirects[i];

				/* skip instructions already scheduled: */
				if (indirect->flags & IR3_INSTR_MARK)
					continue;

				/* remap remaining instructions using the current
				 * addr to the new addr:
				 */
				if (indirect->address == ctx->addr)
					indirect->address = new_addr;
			}

			/* all remaining indirects remapped to the new addr: */
			ctx->addr = NULL;

			/* not really scheduled, but this will trigger us to go
			 * back to the main trysched() loop now that we've
			 * resolved the conflict by duplicating the instr that
			 * writes to the address register.
			 */
			return SCHEDULED;
		}
	}

	return cnt;
}

static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct ir3_instruction *instr;

	/* schedule all the shader inputs (meta-instrs) first so that
	 * the RA step sees that the input registers contain a value
	 * from the start of the shader:
	 */
	if (!block->parent) {
		unsigned i;
		for (i = 0; i < block->ninputs; i++) {
			struct ir3_instruction *in = block->inputs[i];
			if (in)
				schedule(ctx, in, true);
		}
	}

	while ((instr = block->head) && !ctx->error) {
		/* NOTE: always grab next *before* trysched(), in case the
		 * instruction is actually scheduled (and therefore moved
		 * from the depth list into the scheduled list)
		 */
		struct ir3_instruction *next = instr->next;
		int cnt = trysched(ctx, instr);

		if (cnt == DELAYED)
			cnt = block_sched_undelayed(ctx, block);

		/* -1 is a signal to return up the stack, but to us it means
		 * the same as 0:
		 */
		cnt = MAX2(0, cnt);
		cnt += ctx->cnt;
		instr = next;

		/* if the deepest remaining instruction cannot be scheduled,
		 * try increasingly shallower instructions until the needed
		 * number of delay slots is filled:
		 */
		while (instr && (cnt > ctx->cnt)) {
			next = instr->next;
			trysched(ctx, instr);
			instr = next;
		}

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nops:
		 */
		while (cnt > ctx->cnt)
			schedule(ctx, ir3_NOP(block), false);
	}

	/* at this point the scheduled list is in reverse order, so fix that: */
	block->head = reverse(ctx->scheduled);
}

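/* schedule the block, returning zero on success or -1 if we were
 * unable to schedule (e.g. an unresolvable predicate register
 * conflict):
 */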
int ir3_block_sched(struct ir3_block *block)
{
	struct ir3_sched_ctx ctx = {0};
	ir3_clear_mark(block->shader);
	block_sched(&ctx, block);
	if (ctx.error)
		return -1;
	return 0;
}