freedreno/ir3: move nop padding to legalize
src/freedreno/ir3/ir3_delay.c
/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "ir3.h"

/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume ssa
 * src iterators work.
 */
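
/* As a concrete illustration of what these helpers compute (example
 * made up for this comment, not taken from real output): an ALU
 * result is not ready for another ALU consumer until three cycles
 * later (see ir3_delayslots() below), so with no other useful work
 * to schedule, the final program must be padded:
 *
 *    add.f r0.x, r1.x, r2.x
 *    nop
 *    nop
 *    nop
 *    mul.f r0.y, r0.x, r3.x
 */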

/* Generally don't count false dependencies, since they can just be
 * something like a barrier or an SSBO store.  The exception is array
 * dependencies, when the assigner is an array write and the consumer
 * reads the same array.
 */
static bool
ignore_dep(struct ir3_instruction *assigner,
      struct ir3_instruction *consumer, unsigned n)
{
   if (!__is_false_dep(consumer, n))
      return false;

   if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
      struct ir3_register *dst = assigner->regs[0];
      struct ir3_register *src;

      debug_assert(dst->flags & IR3_REG_ARRAY);

      foreach_src (src, consumer) {
         if ((src->flags & IR3_REG_ARRAY) &&
               (dst->array.id == src->array.id)) {
            return false;
         }
      }
   }

   return true;
}
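
/* For example (illustrative): a false dep used only to order an
 * instruction after a barrier or an SSBO store contributes no delay
 * slots, but a false dep ordering a read of array a0 after a write
 * to a0 is a real data dependency, and is counted like any other
 * source.
 */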

/* calculate required # of delay slots between the instruction that
 * assigns a value and the one that consumes it:
 */
int
ir3_delayslots(struct ir3_instruction *assigner,
      struct ir3_instruction *consumer, unsigned n)
{
   if (ignore_dep(assigner, consumer, n))
      return 0;

   /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
    * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
    * handled with sync bits
    */

   if (is_meta(assigner) || is_meta(consumer))
      return 0;

   if (writes_addr(assigner))
      return 6;

   /* handled via sync flags: */
   if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
      return 0;

   /* assigner must be alu: */
   if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
         is_mem(consumer)) {
      return 6;
   } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
         (n == 3)) {
      /* special case, 3rd src to cat3 not required on first cycle */
      return 1;
   } else {
      return 3;
   }
}
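
/* Worked example (illustrative): for a cat3 consumer like
 *
 *    mad.f32 rD, rA, rB, rC
 *
 * an ALU assigner feeding rC (srcn == 3) needs only 1 delay slot,
 * since the 3rd src is not required on the first cycle, while one
 * feeding rA or rB needs the full 3.
 */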

static bool
count_instruction(struct ir3_instruction *n)
{
   /* NOTE: don't count branch/jump since we don't know yet if they will
    * be eliminated later in resolve_jumps().. really should do that
    * earlier so we don't have this constraint.
    */
   return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR));
}

/**
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions).  For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd: max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred: if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
unsigned
ir3_distance(struct ir3_block *block, struct ir3_instruction *instr,
      unsigned maxd, bool pred)
{
   unsigned d = 0;

   /* Note that this relies on incrementally building up the block's
    * instruction list.. but this is how scheduling and nopsched
    * work.
    */
   foreach_instr_rev (n, &block->instr_list) {
      if ((n == instr) || (d >= maxd))
         return MIN2(maxd, d + n->nop);
      if (count_instruction(n))
         d = MIN2(maxd, d + 1 + n->repeat + n->nop);
   }

   /* if coming from a predecessor block, assume it is assigned far
    * enough away.. we'll fix up later.
    */
   if (!pred)
      return maxd;

   if (block->data != block) {
      /* Search into predecessor blocks, finding the one with the
       * shortest distance, since that will be the worst case
       */
      unsigned min = maxd - d;

      /* (ab)use block->data to prevent recursion: */
      block->data = block;

      set_foreach (block->predecessors, entry) {
         struct ir3_block *pred_block = (struct ir3_block *)entry->key;
         unsigned n = ir3_distance(pred_block, instr, min, pred);

         min = MIN2(min, n);
      }

      block->data = NULL;
      d += min;
   }

   return d;
}
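
/* E.g. if an assigner needs 3 delay slots but two counted
 * instructions already sit between it and the consumer, only one
 * more slot is required; delay_calc_srcn() below computes exactly
 * that: ir3_delayslots() minus the distance found above.
 */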

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
      struct ir3_instruction *assigner,
      struct ir3_instruction *consumer,
      unsigned srcn, bool soft, bool pred)
{
   unsigned delay = 0;

   if (is_meta(assigner)) {
      struct ir3_register *src;
      foreach_src (src, assigner) {
         unsigned d;

         if (!src->instr)
            continue;

         d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
         delay = MAX2(delay, d);
      }
   } else {
      if (soft) {
         if (is_sfu(assigner)) {
            delay = 4;
         } else {
            delay = ir3_delayslots(assigner, consumer, srcn);
         }
      } else {
         delay = ir3_delayslots(assigner, consumer, srcn);
      }
      delay -= ir3_distance(block, assigner, delay, pred);
   }

   return delay;
}

static struct ir3_instruction *
find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
{
   unsigned d = 0;

   /* Note that this relies on incrementally building up the block's
    * instruction list.. but this is how scheduling and nopsched
    * work.
    */
   foreach_instr_rev (n, &block->instr_list) {
      if (d >= maxd)
         return NULL;
      if (count_instruction(n))
         d++;
      if (dest_regs(n) == 0)
         continue;

      /* note that a dest reg will never be an immediate */
      if (n->regs[0]->array.id == array_id)
         return n;
   }

   return NULL;
}

/* like list_length() but only counts instructions which count in the
 * delay determination:
 */
static unsigned
count_block_delay(struct ir3_block *block)
{
   unsigned delay = 0;
   foreach_instr (n, &block->instr_list) {
      if (!count_instruction(n))
         continue;
      delay++;
   }
   return delay;
}

static unsigned
delay_calc_array(struct ir3_block *block, unsigned array_id,
      struct ir3_instruction *consumer, unsigned srcn,
      bool soft, bool pred, unsigned maxd)
{
   struct ir3_instruction *assigner;

   assigner = find_array_write(block, array_id, maxd);
   if (assigner)
      return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);

   if (!pred)
      return 0;

   unsigned len = count_block_delay(block);
   if (maxd <= len)
      return 0;

   maxd -= len;

   if (block->data == block) {
      /* we have a loop, return worst case: */
      return maxd;
   }

   /* If we need to search into predecessors, find the one with the
    * max delay.. the resulting delay is that minus the number of
    * counted instructions in this block:
    */
   unsigned max = 0;

   /* (ab)use block->data to prevent recursion: */
   block->data = block;

   set_foreach (block->predecessors, entry) {
      struct ir3_block *pred_block = (struct ir3_block *)entry->key;
      unsigned delay =
         delay_calc_array(pred_block, array_id, consumer, srcn, soft, pred, maxd);

      max = MAX2(max, delay);
   }

   block->data = NULL;

   if (max < len)
      return 0;

   return max - len;
}

/**
 * Calculate delay for instruction (maximum of delay for all srcs):
 *
 * @soft: If true, add additional delay for situations where it
 *    would not be strictly required because a sync flag would be
 *    used (but the scheduler would prefer to schedule some other
 *    instructions first to avoid stalling on the sync flag)
 * @pred: If true, recurse into predecessor blocks
 */
unsigned
ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
      bool soft, bool pred)
{
   unsigned delay = 0;
   struct ir3_register *src;

   foreach_src_n (src, i, instr) {
      unsigned d = 0;

      if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
         d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
      } else if (src->instr) {
         d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
      }

      delay = MAX2(delay, d);
   }

   if (instr->address) {
      unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
      delay = MAX2(delay, d);
   }

   return delay;
}
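
/* Sketch of intended usage (hypothetical caller, not part of this
 * file): a pass like legalize can pad with nops based on the
 * computed delay, roughly:
 *
 *    unsigned delay = ir3_delay_calc(block, instr, false, true);
 *    while (delay > 0) {
 *       ir3_NOP(block);
 *       delay--;
 *    }
 *
 * where ir3_NOP() is the usual ir3 builder helper; the exact loop
 * is illustrative only.
 */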

/**
 * Remove nop instructions.  The scheduler can insert placeholder nop's
 * so that ir3_delay_calc() can account for nop's that won't be needed
 * due to nop's triggered by a previous instruction.  However, before
 * legalize, we want to remove these.  The legalize pass can insert
 * some nop's if needed to hold (for example) sync flags.  Any nops
 * still required are inserted by legalize after this.
 */
void
ir3_remove_nops(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_NOP) {
            list_del(&instr->node);
         }
      }
   }
}