/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

enum {
	SCHEDULED = -1,
	DELAYED = -2,
};

/*
 * Instruction Scheduling:
 *
 * Using the depth sorted list from the depth pass, attempt to recursively
 * schedule the deepest unscheduled path.  When we hit the first instruction
 * that cannot be scheduled, it returns the number of delay slots it still
 * needs, at which point we return back up to the top and attempt to
 * schedule by the next highest depth.  After a sufficient number of
 * instructions have been scheduled, return back to the beginning of the
 * list and start again.  If we reach the end of the depth sorted list
 * without being able to insert any instruction, insert nops.  Repeat until
 * there are no more unscheduled instructions.
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of the address
 * register (a0.x) or predicate register (p0.x) must be serialized.  I.e.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
	struct ir3_instruction *scheduled; /* last scheduled instr */
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	unsigned cnt;
};

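/* find the deepest instruction remaining in srcs[], clearing its slot in
 * the array so it is not visited again:
 */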
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

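/* count the alu/flow instructions scheduled since 'instr' was scheduled,
 * capped at maxd, ie. how many delay slots have already elapsed:
 */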
static unsigned distance(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr, unsigned maxd)
{
	struct ir3_instruction *n = ctx->scheduled;
	unsigned d = 0;
	while (n && (n != instr) && (d < maxd)) {
		if (is_alu(n) || is_flow(n))
			d++;
		n = n->next;
	}
	return d;
}

/* TODO maybe we want a doubly linked list? */
static struct ir3_instruction * prev(struct ir3_instruction *instr)
{
	struct ir3_instruction *p = instr->block->head;
	while (p && (p->next != instr))
		p = p->next;
	return p;
}

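/* remove 'instr' from the depth list (if requested), note any a0.x/p0.x
 * write, and prepend it onto the (reverse order) scheduled list:
 */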
static void schedule(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr, bool remove)
{
	struct ir3_block *block = instr->block;

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr))
		schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);

	/* remove from depth list:
	 */
	if (remove) {
		struct ir3_instruction *p = prev(instr);

		/* NOTE: this can happen for inputs which are not
		 * read.. in that case there is no need to schedule
		 * the input, so just bail:
		 */
		if (instr != (p ? p->next : block->head))
			return;

		if (p)
			p->next = instr->next;
		else
			block->head = instr->next;
	}

	if (writes_addr(instr)) {
		assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	instr->next = ctx->scheduled;
	ctx->scheduled = instr;

	ctx->cnt++;
}

/*
 * Delay-slot calculation. Follows fanin/fanout.
 */

static unsigned delay_calc2(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		unsigned i;
		for (i = 1; i < assigner->regs_count; i++) {
			struct ir3_register *reg = assigner->regs[i];
			if (reg->flags & IR3_REG_SSA) {
				unsigned d = delay_calc2(ctx, reg->instr,
						consumer, srcn);
				delay = MAX2(delay, d);
			}
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

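/* worst-case number of delay slots still required before 'instr' can be
 * scheduled, considering all of its SSA sources:
 */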
static unsigned delay_calc(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	unsigned i, delay = 0;

	for (i = 1; i < instr->regs_count; i++) {
		struct ir3_register *reg = instr->regs[i];
		if (reg->flags & IR3_REG_SSA) {
			unsigned d = delay_calc2(ctx, reg->instr,
					instr, i - 1);
			delay = MAX2(delay, d);
		}
	}

	return delay;
}

/* A negative return value (SCHEDULED or DELAYED) signals that an
 * instruction was either scheduled or deferred; return back up to the
 * top of the stack (to block_sched()).
 */
static int trysched(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1];
	struct ir3_instruction *src;
	unsigned i, delay, nsrcs = 0;

	/* if already scheduled: */
	if (instr->flags & IR3_INSTR_MARK)
		return 0;

	/* figure out our srcs: */
	for (i = 1; i < instr->regs_count; i++) {
		struct ir3_register *reg = instr->regs[i];
		if (reg->flags & IR3_REG_SSA)
			srcs[nsrcs++] = reg->instr;
	}

	/* for each src, in order of decreasing depth:
	 */
	delay = 0;
	while ((src = deepest(srcs, nsrcs))) {
		delay = trysched(ctx, src);
		if (delay)
			return delay;
	}

	/* all our dependencies (srcs) are scheduled, figure out if
	 * we have enough delay slots to schedule ourself:
	 */
	delay = delay_calc(ctx, instr);
	if (delay)
		return delay;

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		assert(ctx->addr != instr);
		return DELAYED;
	}
	if (writes_pred(instr) && ctx->pred) {
		assert(ctx->pred != instr);
		return DELAYED;
	}

	schedule(ctx, instr, true);
	return SCHEDULED;
}

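/* reverse the (singly linked) scheduled list back into program order: */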
static struct ir3_instruction * reverse(struct ir3_instruction *instr)
{
	struct ir3_instruction *reversed = NULL;
	while (instr) {
		struct ir3_instruction *next = instr->next;
		instr->next = reversed;
		reversed = instr;
		instr = next;
	}
	return reversed;
}

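/* check if any src of 'instr' is addressed via the current a0.x writer
 * (ctx->addr):
 */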
static bool uses_current_addr(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	unsigned i;
	for (i = 1; i < instr->regs_count; i++) {
		struct ir3_register *reg = instr->regs[i];
		if (reg->flags & IR3_REG_SSA) {
			if (is_addr(reg->instr)) {
				struct ir3_instruction *addr;
				addr = reg->instr->regs[1]->instr; /* the mova */
				if (ctx->addr == addr)
					return true;
			}
		}
	}
	return false;
}

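/* check if any src of 'instr' is the current p0.x writer (ctx->pred): */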
static bool uses_current_pred(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *instr)
{
	unsigned i;
	for (i = 1; i < instr->regs_count; i++) {
		struct ir3_register *reg = instr->regs[i];
		if ((reg->flags & IR3_REG_SSA) && (ctx->pred == reg->instr))
			return true;
	}
	return false;
}

/* when we encounter an instruction that writes to the address (or
 * predicate) register while it is still in use, we delay that instruction
 * and try to schedule all other instructions using the current value
 * first:
 */
static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
		struct ir3_block *block)
{
	struct ir3_instruction *instr = block->head;
	bool addr_in_use = false;
	bool pred_in_use = false;
	unsigned cnt = ~0;

	while (instr) {
		struct ir3_instruction *next = instr->next;
		bool addr = uses_current_addr(ctx, instr);
		bool pred = uses_current_pred(ctx, instr);

		if (addr || pred) {
			int ret = trysched(ctx, instr);
			if (ret == SCHEDULED)
				cnt = 0;
			else if (ret > 0)
				cnt = MIN2(cnt, ret);
			if (addr)
				addr_in_use = true;
			if (pred)
				pred_in_use = true;
		}

		instr = next;
	}

	if (!addr_in_use)
		ctx->addr = NULL;

	if (!pred_in_use)
		ctx->pred = NULL;

	return cnt;
}

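/* schedule the block: repeatedly pick the deepest remaining instruction
 * from the depth list, filling any required delay slots with shallower
 * instructions or, failing that, nops:
 */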
static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct ir3_instruction *instr;

	/* schedule all the shader inputs (meta-instr) first so that
	 * the RA step sees that the input registers contain a value
	 * from the start of the shader:
	 */
	if (!block->parent) {
		unsigned i;
		for (i = 0; i < block->ninputs; i++) {
			struct ir3_instruction *in = block->inputs[i];
			if (in)
				schedule(ctx, in, true);
		}
	}

	while ((instr = block->head)) {
		/* NOTE: always grab next *before* trysched(), in case the
		 * instruction is actually scheduled (and therefore moved
		 * from depth list into scheduled list)
		 */
		struct ir3_instruction *next = instr->next;
		int cnt = trysched(ctx, instr);

		if (cnt == DELAYED)
			cnt = block_sched_undelayed(ctx, block);

		/* -1 is signal to return up stack, but to us means same as 0: */
		cnt = MAX2(0, cnt);
		cnt += ctx->cnt;
		instr = next;

		/* if the deepest remaining instruction cannot be scheduled,
		 * try increasingly shallower instructions until the needed
		 * number of delay slots is filled:
		 */
		while (instr && (cnt > ctx->cnt)) {
			next = instr->next;
			trysched(ctx, instr);
			instr = next;
		}

		/* and if we run out of instructions that can be scheduled,
		 * then it is time for nops:
		 */
		while (cnt > ctx->cnt)
			schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
	}

	/* at this point, scheduled list is in reverse order, so fix that: */
	block->head = reverse(ctx->scheduled);
}

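/* entry point: clear the MARK flags on the shader and schedule the block: */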
void ir3_block_sched(struct ir3_block *block)
{
	struct ir3_sched_ctx ctx = {0};
	ir3_clear_mark(block->shader);
	block_sched(&ctx, block);
}