2 * Copyright (C) 2019 Google, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * Rob Clark <robclark@freedesktop.org>
/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume ssa
 * src iterators work.
 */
38 /* generally don't count false dependencies, since this can just be
39 * something like a barrier, or SSBO store. The exception is array
40 * dependencies if the assigner is an array write and the consumer
41 * reads the same array.
44 ignore_dep(struct ir3_instruction
*assigner
,
45 struct ir3_instruction
*consumer
, unsigned n
)
47 if (!__is_false_dep(consumer
, n
))
50 if (assigner
->barrier_class
& IR3_BARRIER_ARRAY_W
) {
51 struct ir3_register
*dst
= assigner
->regs
[0];
53 debug_assert(dst
->flags
& IR3_REG_ARRAY
);
55 foreach_src (src
, consumer
) {
56 if ((src
->flags
& IR3_REG_ARRAY
) &&
57 (dst
->array
.id
== src
->array
.id
)) {
66 /* calculate required # of delay slots between the instruction that
67 * assigns a value and the one that consumes
70 ir3_delayslots(struct ir3_instruction
*assigner
,
71 struct ir3_instruction
*consumer
, unsigned n
, bool soft
)
73 if (ignore_dep(assigner
, consumer
, n
))
76 /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
77 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
78 * handled with sync bits
81 if (is_meta(assigner
) || is_meta(consumer
))
84 if (writes_addr0(assigner
) || writes_addr1(assigner
))
87 /* On a6xx, it takes the number of delay slots to get a SFU result
88 * back (ie. using nop's instead of (ss) is:
94 * and so on. Not quite sure where it tapers out (ie. how many
95 * warps share an SFU unit). But 10 seems like a reasonable #
98 if (soft
&& is_sfu(assigner
))
101 /* handled via sync flags: */
102 if (is_sfu(assigner
) || is_tex(assigner
) || is_mem(assigner
))
105 /* assigner must be alu: */
106 if (is_flow(consumer
) || is_sfu(consumer
) || is_tex(consumer
) ||
109 } else if ((is_mad(consumer
->opc
) || is_madsh(consumer
->opc
)) &&
111 /* special case, 3rd src to cat3 not required on first cycle */
119 count_instruction(struct ir3_instruction
*n
)
121 /* NOTE: don't count branch/jump since we don't know yet if they will
122 * be eliminated later in resolve_jumps().. really should do that
123 * earlier so we don't have this constraint.
125 return is_alu(n
) || (is_flow(n
) && (n
->opc
!= OPC_JUMP
) && (n
->opc
!= OPC_B
));
129 * @block: the block to search in, starting from end; in first pass,
130 * this will be the block the instruction would be inserted into
131 * (but has not yet, ie. it only contains already scheduled
132 * instructions). For intra-block scheduling (second pass), this
133 * would be one of the predecessor blocks.
134 * @instr: the instruction to search for
135 * @maxd: max distance, bail after searching this # of instruction
136 * slots, since it means the instruction we are looking for is
138 * @pred: if true, recursively search into predecessor blocks to
139 * find the worst case (shortest) distance (only possible after
140 * individual blocks are all scheduled)
143 distance(struct ir3_block
*block
, struct ir3_instruction
*instr
,
144 unsigned maxd
, bool pred
)
148 /* Note that this relies on incrementally building up the block's
149 * instruction list.. but this is how scheduling and nopsched
152 foreach_instr_rev (n
, &block
->instr_list
) {
153 if ((n
== instr
) || (d
>= maxd
))
154 return MIN2(maxd
, d
+ n
->nop
);
155 if (count_instruction(n
))
156 d
= MIN2(maxd
, d
+ 1 + n
->repeat
+ n
->nop
);
159 /* if coming from a predecessor block, assume it is assigned far
160 * enough away.. we'll fix up later.
165 if (pred
&& (block
->data
!= block
)) {
166 /* Search into predecessor blocks, finding the one with the
167 * shortest distance, since that will be the worst case
169 unsigned min
= maxd
- d
;
171 /* (ab)use block->data to prevent recursion: */
174 set_foreach (block
->predecessors
, entry
) {
175 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
178 n
= distance(pred
, instr
, min
, pred
);
190 /* calculate delay for specified src: */
192 delay_calc_srcn(struct ir3_block
*block
,
193 struct ir3_instruction
*assigner
,
194 struct ir3_instruction
*consumer
,
195 unsigned srcn
, bool soft
, bool pred
)
199 if (is_meta(assigner
)) {
200 foreach_src_n (src
, n
, assigner
) {
206 d
= delay_calc_srcn(block
, src
->instr
, consumer
, srcn
, soft
, pred
);
208 /* A (rptN) instruction executes in consecutive cycles so
209 * it's outputs are written in successive cycles. And
210 * likewise for it's (r)'d (incremented) inputs, they are
211 * read on successive cycles.
213 * So we need to adjust the delay for (rptN)'s assigners
214 * and consumers accordingly.
216 * Note that the dst of a (rptN) instruction is implicitly
217 * (r) (the assigner case), although that is not the case
218 * for src registers. There is exactly one case, bary.f,
219 * which has a vecN (collect) src that is not (r)'d.
221 if ((assigner
->opc
== OPC_META_SPLIT
) && src
->instr
->repeat
) {
222 /* (rptN) assigner case: */
223 d
-= MIN2(d
, src
->instr
->repeat
- assigner
->split
.off
);
224 } else if ((assigner
->opc
== OPC_META_COLLECT
) && consumer
->repeat
&&
225 (consumer
->regs
[srcn
]->flags
& IR3_REG_R
)) {
229 delay
= MAX2(delay
, d
);
232 delay
= ir3_delayslots(assigner
, consumer
, srcn
, soft
);
233 delay
-= distance(block
, assigner
, delay
, pred
);
239 static struct ir3_instruction
*
240 find_array_write(struct ir3_block
*block
, unsigned array_id
, unsigned maxd
)
244 /* Note that this relies on incrementally building up the block's
245 * instruction list.. but this is how scheduling and nopsched
248 foreach_instr_rev (n
, &block
->instr_list
) {
251 if (count_instruction(n
))
253 if (dest_regs(n
) == 0)
256 /* note that a dest reg will never be an immediate */
257 if (n
->regs
[0]->array
.id
== array_id
)
264 /* like list_length() but only counts instructions which count in the
265 * delay determination:
268 count_block_delay(struct ir3_block
*block
)
271 foreach_instr (n
, &block
->instr_list
) {
272 if (!count_instruction(n
))
280 delay_calc_array(struct ir3_block
*block
, unsigned array_id
,
281 struct ir3_instruction
*consumer
, unsigned srcn
,
282 bool soft
, bool pred
, unsigned maxd
)
284 struct ir3_instruction
*assigner
;
286 assigner
= find_array_write(block
, array_id
, maxd
);
288 return delay_calc_srcn(block
, assigner
, consumer
, srcn
, soft
, pred
);
293 unsigned len
= count_block_delay(block
);
299 if (block
->data
== block
) {
300 /* we have a loop, return worst case: */
304 /* If we need to search into predecessors, find the one with the
305 * max delay.. the resulting delay is that minus the number of
306 * counted instructions in this block:
310 /* (ab)use block->data to prevent recursion: */
313 set_foreach (block
->predecessors
, entry
) {
314 struct ir3_block
*pred
= (struct ir3_block
*)entry
->key
;
316 delay_calc_array(pred
, array_id
, consumer
, srcn
, soft
, pred
, maxd
);
318 max
= MAX2(max
, delay
);
330 * Calculate delay for instruction (maximum of delay for all srcs):
332 * @soft: If true, add additional delay for situations where they
333 * would not be strictly required because a sync flag would be
334 * used (but scheduler would prefer to schedule some other
335 * instructions first to avoid stalling on sync flag)
336 * @pred: If true, recurse into predecessor blocks
339 ir3_delay_calc(struct ir3_block
*block
, struct ir3_instruction
*instr
,
340 bool soft
, bool pred
)
344 foreach_src_n (src
, i
, instr
) {
347 if ((src
->flags
& IR3_REG_RELATIV
) && !(src
->flags
& IR3_REG_CONST
)) {
348 d
= delay_calc_array(block
, src
->array
.id
, instr
, i
+1, soft
, pred
, 6);
349 } else if (src
->instr
) {
350 d
= delay_calc_srcn(block
, src
->instr
, instr
, i
+1, soft
, pred
);
353 delay
= MAX2(delay
, d
);
356 if (instr
->address
) {
357 unsigned d
= delay_calc_srcn(block
, instr
->address
, instr
, 0, soft
, pred
);
358 delay
= MAX2(delay
, d
);
365 * Remove nop instructions. The scheduler can insert placeholder nop's
366 * so that ir3_delay_calc() can account for nop's that won't be needed
367 * due to nop's triggered by a previous instruction. However, before
368 * legalize, we want to remove these. The legalize pass can insert
369 * some nop's if needed to hold (for example) sync flags. This final
370 * remaining nops are inserted by legalize after this.
373 ir3_remove_nops(struct ir3
*ir
)
375 foreach_block (block
, &ir
->block_list
) {
376 foreach_instr_safe (instr
, &block
->instr_list
) {
377 if (instr
->opc
== OPC_NOP
) {
378 list_del(&instr
->node
);