/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

enum {
	SCHEDULED = -1,
	DELAYED = -2,
};

/*
 * Instruction Scheduling:
 *
 * Using the depth sorted list from the depth pass, attempt to recursively
 * schedule the deepest unscheduled path.  When we hit the first instruction
 * that cannot be scheduled, it returns the number of delay slots it still
 * needs, at which point we return back up to the top and attempt to
 * schedule by the next highest depth.  After a sufficient number of
 * instructions have been scheduled, return back to the beginning of the
 * list and start again.  If we reach the end of the depth sorted list
 * without being able to insert any instruction, insert nops.  Repeat until
 * there are no more unscheduled instructions.
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of the address
 * register (a0.x) or predicate register (p0.x) must be serialized.  I.e.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
	struct ir3_instruction *scheduled; /* last scheduled instr */
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	unsigned cnt;
};

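/* find the deepest instruction remaining in srcs[], clearing its slot in
 * the array so it is not visited again:
 */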
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

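/* count the alu/flow instructions scheduled since 'instr' was scheduled,
 * capped at maxd, ie. how many delay slots have already elapsed:
 */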
static unsigned distance(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr, unsigned maxd)
{
	struct ir3_instruction *n = ctx->scheduled;
	unsigned d = 0;
	while (n && (n != instr) && (d < maxd)) {
		if (is_alu(n) || is_flow(n))
			d++;
		n = n->next;
	}
	return d;
}

/* TODO maybe we want a doubly linked list? */
static struct ir3_instruction * prev(struct ir3_instruction *instr)
{
	struct ir3_instruction *p = instr->block->head;
	while (p && (p->next != instr))
		p = p->next;
	return p;
}

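/* remove 'instr' from the depth list (if requested), note any a0.x/p0.x
 * write, and prepend it onto the (reverse order) scheduled list:
 */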
static void schedule(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr, bool remove)
{
	struct ir3_block *block = instr->block;

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr))
		schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);

	/* remove from depth list:
	 */
	if (remove) {
		struct ir3_instruction *p = prev(instr);

		/* NOTE: this can happen for inputs which are not
		 * read.. in that case there is no need to schedule
		 * the input, so just bail:
		 */
		if (instr != (p ? p->next : block->head))
			return;

		if (p)
			p->next = instr->next;
		else
			block->head = instr->next;
	}

	if (writes_addr(instr)) {
		assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	instr->next = ctx->scheduled;
	ctx->scheduled = instr;

	ctx->cnt++;
}

/*
 * Delay-slot calculation. Follows fanin/fanout.
 */

static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		unsigned i;
		for (i = 1; i < assigner->regs_count; i++) {
			struct ir3_register *reg = assigner->regs[i];
			if (reg->flags & IR3_REG_SSA) {
				unsigned d = delay_calc2(ctx, reg->instr,
						consumer, srcn);
				delay = MAX2(delay, d);
			}
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

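/* worst-case number of delay slots still required before 'instr' can be
 * scheduled, considering all of its SSA sources:
 */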
static unsigned delay_calc(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	unsigned i, delay = 0;

	for (i = 1; i < instr->regs_count; i++) {
		struct ir3_register *reg = instr->regs[i];
		if (reg->flags & IR3_REG_SSA) {
			unsigned d = delay_calc2(ctx, reg->instr,
					instr, i - 1);
			delay = MAX2(delay, d);
		}
	}

	return delay;
}

/* A negative return value (SCHEDULED or DELAYED) signals that an
 * instruction was either scheduled or deferred; return back up to the
 * top of the stack (to block_sched()).
 */
static int trysched(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1];
	struct ir3_instruction *src;
	unsigned i, delay, nsrcs = 0;

	/* if already scheduled: */
	if (instr->flags & IR3_INSTR_MARK)
		return 0;

	/* figure out our srcs: */
	for (i = 1; i < instr->regs_count; i++) {
		struct ir3_register *reg = instr->regs[i];
		if (reg->flags & IR3_REG_SSA)
			srcs[nsrcs++] = reg->instr;
	}

	/* for each src, in order of decreasing depth:
	 */
	delay = 0;
	while ((src = deepest(srcs, nsrcs))) {
		delay = trysched(ctx, src);
		if (delay)
			return delay;
	}

	/* all our dependencies (srcs) are scheduled, figure out if
	 * we have enough delay slots to schedule ourself:
	 */
	delay = delay_calc(ctx, instr);
	if (delay)
		return delay;

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		assert(ctx->addr != instr);
		return DELAYED;
	}
	if (writes_pred(instr) && ctx->pred) {
		assert(ctx->pred != instr);
		return DELAYED;
	}

	schedule(ctx, instr, true);
	return SCHEDULED;
}

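/* reverse the (singly linked) scheduled list back into program order: */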
static struct ir3_instruction * reverse(struct ir3_instruction *instr)
{
	struct ir3_instruction *reversed = NULL;
	while (instr) {
		struct ir3_instruction *next = instr->next;
		instr->next = reversed;
		reversed = instr;
		instr = next;
	}
	return reversed;
}

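/* check if any src of 'instr' is addressed via the current a0.x writer
 * (ctx->addr):
 */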
static bool uses_current_addr(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	unsigned i;
	for (i = 1; i < instr->regs_count; i++) {
		struct ir3_register *reg = instr->regs[i];
		if (reg->flags & IR3_REG_SSA) {
			if (is_addr(reg->instr)) {
				struct ir3_instruction *addr;
				addr = reg->instr->regs[1]->instr; /* the mova */
				if (ctx->addr == addr)
					return true;
			}
		}
	}
	return false;
}

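/* check if any src of 'instr' is the current p0.x writer (ctx->pred): */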
static bool uses_current_pred(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	unsigned i;
	for (i = 1; i < instr->regs_count; i++) {
		struct ir3_register *reg = instr->regs[i];
		if ((reg->flags & IR3_REG_SSA) && (ctx->pred == reg->instr))
			return true;
	}
	return false;
}

/* when we encounter an instruction that writes to the address (or
 * predicate) register while it is still in use, we delay that instruction
 * and try to schedule all other instructions using the current value
 * first:
 */
static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
		struct ir3_block *block)
{
	struct ir3_instruction *instr = block->head;
	bool addr_in_use = false;
	bool pred_in_use = false;
	unsigned cnt = ~0;

	while (instr) {
		struct ir3_instruction *next = instr->next;
		bool addr = uses_current_addr(ctx, instr);
		bool pred = uses_current_pred(ctx, instr);

		if (addr || pred) {
			int ret = trysched(ctx, instr);
			if (ret == SCHEDULED)
				cnt = 0;
			else if (ret > 0)
				cnt = MIN2(cnt, ret);
			if (addr)
				addr_in_use = true;
			if (pred)
				pred_in_use = true;
		}

		instr = next;
	}

	if (!addr_in_use)
		ctx->addr = NULL;

	if (!pred_in_use)
		ctx->pred = NULL;

	return cnt;
}

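/* schedule the block: repeatedly pick the deepest remaining instruction
 * from the depth list, filling any required delay slots with shallower
 * instructions or, failing that, nops:
 */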
static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct ir3_instruction *instr;

	/* schedule all the shader inputs (meta-instr) first so that
	 * the RA step sees that the input registers contain a value
	 * from the start of the shader:
	 */
	if (!block->parent) {
		unsigned i;
		for (i = 0; i < block->ninputs; i++) {
			struct ir3_instruction *in = block->inputs[i];
			if (in)
				schedule(ctx, in, true);
		}
	}

	while ((instr = block->head)) {
		/* NOTE: always grab next *before* trysched(), in case the
		 * instruction is actually scheduled (and therefore moved
		 * from depth list into scheduled list)
		 */
		struct ir3_instruction *next = instr->next;
		int cnt = trysched(ctx, instr);

		if (cnt == DELAYED)
			cnt = block_sched_undelayed(ctx, block);

		/* -1 is signal to return up stack, but to us means same as 0: */
		cnt = MAX2(0, cnt);
		cnt += ctx->cnt;
		instr = next;

		/* if the deepest remaining instruction cannot be scheduled,
		 * try increasingly shallower instructions until the needed
		 * number of delay slots is filled:
		 */
		while (instr && (cnt > ctx->cnt)) {
			next = instr->next;
			trysched(ctx, instr);
			instr = next;
		}

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nops:
		 */
		while (cnt > ctx->cnt)
			schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
	}

	/* at this point, scheduled list is in reverse order, so fix that: */
	block->head = reverse(ctx->scheduled);
}

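/* entry point: clear the MARK flags on the shader and schedule the block: */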
void ir3_block_sched(struct ir3_block *block)
{
	struct ir3_sched_ctx ctx = {0};
	ir3_clear_mark(block->shader);
	block_sched(&ctx, block);
}