src/freedreno/ir3/ir3_delay.c

   1 /*
   2  * Copyright (C) 2019 Google, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  *
  23  * Authors:
  24  *    Rob Clark <robclark@freedesktop.org>
  25  */
  26
  27 #include "ir3.h"
  28
  29 /*
  30  * Helpers to figure out the necessary delay slots between instructions.  Used
  31  * both in scheduling pass(es) and the final pass to insert any required nop's
  32  * so that the shader program is valid.
  33  *
  34  * Note that this needs to work both pre and post RA, so we can't assume ssa
  35  * src iterators work.
  36  */
  37
  38 /* generally don't count false dependencies, since this can just be
  39  * something like a barrier, or SSBO store.  The exception is array
  40  * dependencies if the assigner is an array write and the consumer
  41  * reads the same array.
  42  */
  43 static bool
  44 ignore_dep(struct ir3_instruction *assigner,
  45                 struct ir3_instruction *consumer, unsigned n)
  46 {
  47         if (!__is_false_dep(consumer, n))
  48                 return false;
  49
  50         if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
  51                 struct ir3_register *dst = assigner->regs[0];
  52                 struct ir3_register *src;
  53
  54                 debug_assert(dst->flags & IR3_REG_ARRAY);
  55
  56                 foreach_src (src, consumer) {
  57                         if ((src->flags & IR3_REG_ARRAY) &&
  58                                         (dst->array.id == src->array.id)) {
  59                                 return false;
  60                         }
  61                 }
  62         }
  63
  64         return true;
  65 }
  66
  67 /* calculate required # of delay slots between the instruction that
  68  * assigns a value and the one that consumes
  69  */
  70 int
  71 ir3_delayslots(struct ir3_instruction *assigner,
  72                 struct ir3_instruction *consumer, unsigned n, bool soft)
  73 {
  74         if (ignore_dep(assigner, consumer, n))
  75                 return 0;
  76
  77         /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
  78          * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
  79          * handled with sync bits
  80          */
  81
  82         if (is_meta(assigner) || is_meta(consumer))
  83                 return 0;
  84
  85         if (writes_addr(assigner))
  86                 return 6;
  87
  88         /* On a6xx, it takes the number of delay slots to get a SFU result
  89          * back (ie. using nop's instead of (ss) is:
  90          *
  91          *     8 - single warp
  92          *     9 - two warps
  93          *    10 - four warps
  94          *
  95          * and so on.  Not quite sure where it tapers out (ie. how many
  96          * warps share an SFU unit).  But 10 seems like a reasonable #
  97          * to choose:
  98          */
  99         if (soft && is_sfu(assigner))
 100                 return 10;
 101
 102         /* handled via sync flags: */
 103         if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
 104                 return 0;
 105
 106         /* assigner must be alu: */
 107         if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
 108                         is_mem(consumer)) {
 109                 return 6;
 110         } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
 111                         (n == 3)) {
 112                 /* special case, 3rd src to cat3 not required on first cycle */
 113                 return 1;
 114         } else {
 115                 return 3;
 116         }
 117 }
 118
 119 static bool
 120 count_instruction(struct ir3_instruction *n)
 121 {
 122         /* NOTE: don't count branch/jump since we don't know yet if they will
 123          * be eliminated later in resolve_jumps().. really should do that
 124          * earlier so we don't have this constraint.
 125          */
 126         return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR));
 127 }
 128
 129 /**
 130  * @block: the block to search in, starting from end; in first pass,
 131  *    this will be the block the instruction would be inserted into
 132  *    (but has not yet, ie. it only contains already scheduled
 133  *    instructions).  For intra-block scheduling (second pass), this
 134  *    would be one of the predecessor blocks.
 135  * @instr: the instruction to search for
 136  * @maxd:  max distance, bail after searching this # of instruction
 137  *    slots, since it means the instruction we are looking for is
 138  *    far enough away
 139  * @pred:  if true, recursively search into predecessor blocks to
 140  *    find the worst case (shortest) distance (only possible after
 141  *    individual blocks are all scheduled)
 142  */
 143 static unsigned
 144 distance(struct ir3_block *block, struct ir3_instruction *instr,
 145                 unsigned maxd, bool pred)
 146 {
 147         unsigned d = 0;
 148
 149         /* Note that this relies on incrementally building up the block's
 150          * instruction list.. but this is how scheduling and nopsched
 151          * work.
 152          */
 153         foreach_instr_rev (n, &block->instr_list) {
 154                 if ((n == instr) || (d >= maxd))
 155                         return MIN2(maxd, d + n->nop);
 156                 if (count_instruction(n))
 157                         d = MIN2(maxd, d + 1 + n->repeat + n->nop);
 158         }
 159
 160         /* if coming from a predecessor block, assume it is assigned far
 161          * enough away.. we'll fix up later.
 162          */
 163         if (!pred)
 164                 return maxd;
 165
 166         if (pred && (block->data != block)) {
 167                 /* Search into predecessor blocks, finding the one with the
 168                  * shortest distance, since that will be the worst case
 169                  */
 170                 unsigned min = maxd - d;
 171
 172                 /* (ab)use block->data to prevent recursion: */
 173                 block->data = block;
 174
 175                 set_foreach (block->predecessors, entry) {
 176                         struct ir3_block *pred = (struct ir3_block *)entry->key;
 177                         unsigned n;
 178
 179                         n = distance(pred, instr, min, pred);
 180
 181                         min = MIN2(min, n);
 182                 }
 183
 184                 block->data = NULL;
 185                 d += min;
 186         }
 187
 188         return d;
 189 }
 190
 191 /* calculate delay for specified src: */
 192 static unsigned
 193 delay_calc_srcn(struct ir3_block *block,
 194                 struct ir3_instruction *assigner,
 195                 struct ir3_instruction *consumer,
 196                 unsigned srcn, bool soft, bool pred)
 197 {
 198         unsigned delay = 0;
 199
 200         if (is_meta(assigner)) {
 201                 struct ir3_register *src;
 202                 foreach_src (src, assigner) {
 203                         unsigned d;
 204
 205                         if (!src->instr)
 206                                 continue;
 207
 208                         d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
 209                         delay = MAX2(delay, d);
 210                 }
 211         } else {
 212                 delay = ir3_delayslots(assigner, consumer, srcn, soft);
 213                 delay -= distance(block, assigner, delay, pred);
 214         }
 215
 216         return delay;
 217 }
 218
 219 static struct ir3_instruction *
 220 find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
 221 {
 222         unsigned d = 0;
 223
 224         /* Note that this relies on incrementally building up the block's
 225          * instruction list.. but this is how scheduling and nopsched
 226          * work.
 227          */
 228         foreach_instr_rev (n, &block->instr_list) {
 229                 if (d >= maxd)
 230                         return NULL;
 231                 if (count_instruction(n))
 232                         d++;
 233                 if (dest_regs(n) == 0)
 234                         continue;
 235
 236                 /* note that a dest reg will never be an immediate */
 237                 if (n->regs[0]->array.id == array_id)
 238                         return n;
 239         }
 240
 241         return NULL;
 242 }
 243
 244 /* like list_length() but only counts instructions which count in the
 245  * delay determination:
 246  */
 247 static unsigned
 248 count_block_delay(struct ir3_block *block)
 249 {
 250         unsigned delay = 0;
 251         foreach_instr (n, &block->instr_list) {
 252                 if (!count_instruction(n))
 253                         continue;
 254                 delay++;
 255         }
 256         return delay;
 257 }
 258
 259 static unsigned
 260 delay_calc_array(struct ir3_block *block, unsigned array_id,
 261                 struct ir3_instruction *consumer, unsigned srcn,
 262                 bool soft, bool pred, unsigned maxd)
 263 {
 264         struct ir3_instruction *assigner;
 265
 266         assigner = find_array_write(block, array_id, maxd);
 267         if (assigner)
 268                 return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);
 269
 270         if (!pred)
 271                 return 0;
 272
 273         unsigned len = count_block_delay(block);
 274         if (maxd <= len)
 275                 return 0;
 276
 277         maxd -= len;
 278
 279         if (block->data == block) {
 280                 /* we have a loop, return worst case: */
 281                 return maxd;
 282         }
 283
 284         /* If we need to search into predecessors, find the one with the
 285          * max delay.. the resulting delay is that minus the number of
 286          * counted instructions in this block:
 287          */
 288         unsigned max = 0;
 289
 290         /* (ab)use block->data to prevent recursion: */
 291         block->data = block;
 292
 293         set_foreach (block->predecessors, entry) {
 294                 struct ir3_block *pred = (struct ir3_block *)entry->key;
 295                 unsigned delay =
 296                         delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd);
 297
 298                 max = MAX2(max, delay);
 299         }
 300
 301         block->data = NULL;
 302
 303         if (max < len)
 304                 return 0;
 305
 306         return max - len;
 307 }
 308
 309 /**
 310  * Calculate delay for instruction (maximum of delay for all srcs):
 311  *
 312  * @soft:  If true, add additional delay for situations where they
 313  *    would not be strictly required because a sync flag would be
 314  *    used (but scheduler would prefer to schedule some other
 315  *    instructions first to avoid stalling on sync flag)
 316  * @pred:  If true, recurse into predecessor blocks
 317  */
 318 unsigned
 319 ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 320                 bool soft, bool pred)
 321 {
 322         unsigned delay = 0;
 323         struct ir3_register *src;
 324
 325         foreach_src_n (src, i, instr) {
 326                 unsigned d = 0;
 327
 328                 if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
 329                         d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
 330                 } else if (src->instr) {
 331                         d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
 332                 }
 333
 334                 delay = MAX2(delay, d);
 335         }
 336
 337         if (instr->address) {
 338                 unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
 339                 delay = MAX2(delay, d);
 340         }
 341
 342         return delay;
 343 }
 344
 345 /**
 346  * Remove nop instructions.  The scheduler can insert placeholder nop's
 347  * so that ir3_delay_calc() can account for nop's that won't be needed
 348  * due to nop's triggered by a previous instruction.  However, before
 349  * legalize, we want to remove these.  The legalize pass can insert
 350  * some nop's if needed to hold (for example) sync flags.  This final
 351  * remaining nops are inserted by legalize after this.
 352  */
 353 void
 354 ir3_remove_nops(struct ir3 *ir)
 355 {
 356         foreach_block (block, &ir->block_list) {
 357                 foreach_instr_safe (instr, &block->instr_list) {
 358                         if (instr->opc == OPC_NOP) {
 359                                 list_del(&instr->node);
 360                         }
 361                 }
 362         }
 363
 364 }