src/freedreno/ir3/ir3_delay.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "ir3.h"

/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume ssa
 * src iterators work.
 */

/* generally don't count false dependencies, since this can just be
 * something like a barrier, or SSBO store.  The exception is array
 * dependencies if the assigner is an array write and the consumer
 * reads the same array.
 */
static bool
ignore_dep(struct ir3_instruction *assigner,
      struct ir3_instruction *consumer, unsigned n)
{
   if (!__is_false_dep(consumer, n))
      return false;

   if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
      struct ir3_register *dst = assigner->regs[0];

      debug_assert(dst->flags & IR3_REG_ARRAY);

      foreach_src (src, consumer) {
         if ((src->flags & IR3_REG_ARRAY) &&
               (dst->array.id == src->array.id)) {
            return false;
         }
      }
   }

   return true;
}
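
/* For example (added for illustration): a false dep that merely orders the
 * consumer after a barrier or SSBO store costs no delay slots, but if the
 * assigner writes a relative-addressed array and the consumer reads the
 * same array.id, the dependency is honored and the normal delay applies.
 */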

/* calculate required # of delay slots between the instruction that
 * assigns a value and the one that consumes
 */
int
ir3_delayslots(struct ir3_instruction *assigner,
      struct ir3_instruction *consumer, unsigned n, bool soft)
{
   if (ignore_dep(assigner, consumer, n))
      return 0;

   /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
    * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
    * handled with sync bits
    */

   if (is_meta(assigner) || is_meta(consumer))
      return 0;

   if (writes_addr0(assigner) || writes_addr1(assigner))
      return 6;

   /* On a6xx, the number of delay slots it takes to get an SFU result
    * back (ie. when using nop's instead of (ss)) is:
    *
    *    8 - single warp
    *    9 - two warps
    *   10 - four warps
    *
    * and so on.  Not quite sure where it tapers out (ie. how many
    * warps share an SFU unit), but 10 seems like a reasonable # to
    * choose:
    */
   if (soft && is_sfu(assigner))
      return 10;

   /* handled via sync flags: */
   if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
      return 0;

   /* assigner must be alu: */
   if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
         is_mem(consumer)) {
      return 6;
   } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
         (n == 3)) {
      /* special case, 3rd src to cat3 not required on first cycle */
      return 1;
   } else {
      return 3;
   }
}
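
/* Worked examples of the rules above (added for illustration): an alu
 * instruction feeding another alu instruction needs 3 delay slots; an alu
 * instruction feeding a cat4 (sfu), cat5 (tex), cat6 (mem), or flow
 * instruction, or one that writes an address register, needs 6; the 3rd
 * src of a mad/madsh only needs 1.  Sfu/tex/mem producers cost nothing
 * here since the consumer uses (ss)/(sy) sync flags instead, except in
 * soft mode where an SFU producer is charged 10 slots to discourage
 * stalling on (ss).
 */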

static bool
count_instruction(struct ir3_instruction *n)
{
   /* NOTE: don't count branch/jump since we don't know yet if they will
    * be eliminated later in resolve_jumps().. really should do that
    * earlier so we don't have this constraint.
    */
   return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
}

/**
 * @block: the block to search in, starting from end; in the first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet been, ie. it only contains already scheduled
 *    instructions).  For inter-block nop scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
      unsigned maxd, bool pred)
{
   unsigned d = 0;

   /* Note that this relies on incrementally building up the block's
    * instruction list.. but this is how scheduling and nopsched
    * work.
    */
   foreach_instr_rev (n, &block->instr_list) {
      if ((n == instr) || (d >= maxd))
         return MIN2(maxd, d + n->nop);
      if (count_instruction(n))
         d = MIN2(maxd, d + 1 + n->repeat + n->nop);
   }

   /* if coming from a predecessor block, assume it is assigned far
    * enough away.. we'll fix up later.
    */
   if (!pred)
      return maxd;

   if (pred && (block->data != block)) {
      /* Search into predecessor blocks, finding the one with the
       * shortest distance, since that will be the worst case
       */
      unsigned min = maxd - d;

      /* (ab)use block->data to prevent recursion: */
      block->data = block;

      set_foreach (block->predecessors, entry) {
         struct ir3_block *pred = (struct ir3_block *)entry->key;
         unsigned n;

         n = distance(pred, instr, min, pred);

         min = MIN2(min, n);
      }

      block->data = NULL;
      d += min;
   }

   return d;
}

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
      struct ir3_instruction *assigner,
      struct ir3_instruction *consumer,
      unsigned srcn, bool soft, bool pred)
{
   unsigned delay = 0;

   if (is_meta(assigner)) {
      foreach_src (src, assigner) {
         unsigned d;

         if (!src->instr)
            continue;

         d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
         delay = MAX2(delay, d);
      }
   } else {
      delay = ir3_delayslots(assigner, consumer, srcn, soft);
      delay -= distance(block, assigner, delay, pred);
   }

   return delay;
}
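
/* For example (added for illustration): if ir3_delayslots() says an
 * assigner->consumer pair needs 3 slots, but the assigner already sits 2
 * counted instructions back in the block, only 1 more slot is needed.
 * Note that distance() clamps its result to the requested delay, so the
 * unsigned subtraction above can never wrap.
 */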

static struct ir3_instruction *
find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
{
   unsigned d = 0;

   /* Note that this relies on incrementally building up the block's
    * instruction list.. but this is how scheduling and nopsched
    * work.
    */
   foreach_instr_rev (n, &block->instr_list) {
      if (d >= maxd)
         return NULL;
      if (count_instruction(n))
         d++;
      if (dest_regs(n) == 0)
         continue;

      /* note that a dest reg will never be an immediate */
      if (n->regs[0]->array.id == array_id)
         return n;
   }

   return NULL;
}

/* like list_length() but only counts instructions which count in the
 * delay determination:
 */
static unsigned
count_block_delay(struct ir3_block *block)
{
   unsigned delay = 0;
   foreach_instr (n, &block->instr_list) {
      if (!count_instruction(n))
         continue;
      delay++;
   }
   return delay;
}

static unsigned
delay_calc_array(struct ir3_block *block, unsigned array_id,
      struct ir3_instruction *consumer, unsigned srcn,
      bool soft, bool pred, unsigned maxd)
{
   struct ir3_instruction *assigner;

   assigner = find_array_write(block, array_id, maxd);
   if (assigner)
      return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);

   if (!pred)
      return 0;

   unsigned len = count_block_delay(block);
   if (maxd <= len)
      return 0;

   maxd -= len;

   if (block->data == block) {
      /* we have a loop, return worst case: */
      return maxd;
   }

   /* If we need to search into predecessors, find the one with the
    * max delay.. the resulting delay is that minus the number of
    * counted instructions in this block:
    */
   unsigned max = 0;

   /* (ab)use block->data to prevent recursion: */
   block->data = block;

   set_foreach (block->predecessors, entry) {
      struct ir3_block *pred = (struct ir3_block *)entry->key;
      unsigned delay =
         delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd);

      max = MAX2(max, delay);
   }

   block->data = NULL;

   if (max < len)
      return 0;

   return max - len;
}
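
/* Example of the predecessor search (added for illustration): with maxd=6,
 * if this block contains no write to the array but holds 2 counted
 * instructions, and a predecessor block ends with an array write needing 6
 * delay slots, the recursion returns 6 and the consumer still needs
 * 6 - 2 = 4 slots of delay.
 */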

/**
 * Calculate delay for instruction (maximum of delay for all srcs):
 *
 * @soft: If true, add additional delay for situations where it would
 *    not be strictly required because a sync flag would be used (but
 *    the scheduler would prefer to schedule some other instructions
 *    first to avoid stalling on the sync flag)
 * @pred: If true, recurse into predecessor blocks
 */
unsigned
ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
      bool soft, bool pred)
{
   unsigned delay = 0;

   foreach_src_n (src, i, instr) {
      unsigned d = 0;

      if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
         d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
      } else if (src->instr) {
         d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
      }

      delay = MAX2(delay, d);
   }

   if (instr->address) {
      unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
      delay = MAX2(delay, d);
   }

   return delay;
}
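
/* A sketch of how this is typically consumed (names and exact call sites
 * are illustrative, not taken from this file): the legalize/nop pass asks
 * for the hard delay and pads with nop's,
 *
 *    unsigned delay = ir3_delay_calc(block, instr, false, true);
 *    if (delay > 0)
 *       ir3_NOP(block)->repeat = delay - 1;
 *
 * while the scheduler passes soft=true so that SFU latency is preferably
 * hidden behind other instructions rather than a (ss) stall.
 */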

/**
 * Remove nop instructions.  The scheduler can insert placeholder nop's
 * so that ir3_delay_calc() can account for nop's that won't be needed
 * due to nop's triggered by a previous instruction.  However, before
 * legalize, we want to remove these.  The legalize pass can insert
 * some nop's if needed to hold (for example) sync flags; any nop's
 * still required after this removal are re-inserted by legalize.
 */
void
ir3_remove_nops(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_NOP) {
            list_del(&instr->node);
         }
      }
   }
}