i965: Note that write-after-write dependencies are blocking.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_schedule_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "brw_fs.h"
29 #include "glsl/glsl_types.h"
30 #include "glsl/ir_optimization.h"
31 #include "glsl/ir_print_visitor.h"
32
33 /** @file brw_fs_schedule_instructions.cpp
34 *
35 * List scheduling of FS instructions.
36 *
37 * The basic model of the list scheduler is to take a basic block,
38 * compute a DAG of the dependencies (RAW ordering with latency, WAW
39 * ordering with latency, WAR ordering), and make a list of the DAG heads.
40 * Heuristically pick a DAG head, then put all the children that are
41 * now DAG heads into the list of things to schedule.
42 *
43 * The heuristic is the important part. We're trying to be cheap,
44 * since actually computing an optimal schedule is NP-complete.
45 * What we do is track a "current clock". When we schedule a node, we
46 * update the earliest-unblocked clock time of its children, and
47 * increment the clock. Then, when trying to schedule, we just pick
48 * the earliest-unblocked instruction to schedule.
49 *
50 * Note that often there will be many things which could execute
51 * immediately, and there is a range of heuristic options to choose
52 * from in picking among those.
53 */
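
/* As a rough sketch (not the exact code below), the loop described above
 * looks something like this; pick_lowest_unblocked_time(), emit(),
 * issue_cycles(), ready_list and add_to_ready_list() are placeholders for
 * the real logic in instruction_scheduler::schedule_instructions():
 *
 *    while (!ready_list_is_empty()) {
 *       schedule_node *n = pick_lowest_unblocked_time(ready_list);
 *       emit(n->inst);
 *       time = MAX2(time + issue_cycles(n->inst), n->unblocked_time);
 *
 *       for (int i = 0; i < n->child_count; i++) {
 *          schedule_node *child = n->children[i];
 *          child->unblocked_time = MAX2(child->unblocked_time,
 *                                       time + n->child_latency[i]);
 *          if (--child->parent_count == 0)
 *             add_to_ready_list(child);
 *       }
 *    }
 */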
54
55 static bool debug = false;
56
57 class schedule_node : public exec_node
58 {
59 public:
60 schedule_node(fs_inst *inst, int gen)
61 {
62 this->inst = inst;
63 this->child_array_size = 0;
64 this->children = NULL;
65 this->child_latency = NULL;
66 this->child_count = 0;
67 this->parent_count = 0;
68 this->unblocked_time = 0;
69
70 if (gen >= 7)
71 set_latency_gen7();
72 else
73 set_latency_gen4();
74 }
75
76 void set_latency_gen4();
77 void set_latency_gen7();
78
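   /* DAG bookkeeping: children[]/child_latency[] are this node's outgoing
    * dependency edges, parent_count is how many unscheduled parents still
    * block it, latency models the cost of this instruction's result, and
    * unblocked_time is the earliest clock at which the scheduler should
    * consider issuing it.
    */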
79 fs_inst *inst;
80 schedule_node **children;
81 int *child_latency;
82 int child_count;
83 int parent_count;
84 int child_array_size;
85 int unblocked_time;
86 int latency;
87 };
88
89 void
90 schedule_node::set_latency_gen4()
91 {
92 int chans = 8;
93 int math_latency = 22;
94
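   /* The numbers below follow a simple model: each math opcode costs some
    * number of internal rounds, and each round is charged math_latency
    * cycles per channel.  For example, POW at 8 rounds comes out to
    * 8 * 8 * 22 = 1408 cycles of modeled latency.
    */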
95 switch (inst->opcode) {
96 case SHADER_OPCODE_RCP:
97 this->latency = 1 * chans * math_latency;
98 break;
99 case SHADER_OPCODE_RSQ:
100 this->latency = 2 * chans * math_latency;
101 break;
102 case SHADER_OPCODE_INT_QUOTIENT:
103 case SHADER_OPCODE_SQRT:
104 case SHADER_OPCODE_LOG2:
105 /* full precision log. partial is 2. */
106 this->latency = 3 * chans * math_latency;
107 break;
108 case SHADER_OPCODE_INT_REMAINDER:
109 case SHADER_OPCODE_EXP2:
110 /* full precision. partial is 3, same throughput. */
111 this->latency = 4 * chans * math_latency;
112 break;
113 case SHADER_OPCODE_POW:
114 this->latency = 8 * chans * math_latency;
115 break;
116 case SHADER_OPCODE_SIN:
117 case SHADER_OPCODE_COS:
118 /* minimum latency, max is 12 rounds. */
119 this->latency = 5 * chans * math_latency;
120 break;
121 default:
122 this->latency = 2;
123 break;
124 }
125 }
126
127 void
128 schedule_node::set_latency_gen7()
129 {
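   /* The cycle counts quoted in the comments below come from timing short
    * instruction sequences: the first listing measures issuing the
    * instruction with nothing waiting on its result, the second adds a
    * dependent MOV, and the latency chosen is roughly the difference
    * between the two.
    */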
130 switch (inst->opcode) {
131 case BRW_OPCODE_MAD:
132 /* 3 cycles (this is sometimes said to be 4 cycles, depending on the
133 * register numbers in the sources):
134 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
135 *
136 * 20 cycles:
137 * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
138 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
139 */
140 latency = 17;
141 break;
142
143 case SHADER_OPCODE_RCP:
144 case SHADER_OPCODE_RSQ:
145 case SHADER_OPCODE_SQRT:
146 case SHADER_OPCODE_LOG2:
147 case SHADER_OPCODE_EXP2:
148 case SHADER_OPCODE_SIN:
149 case SHADER_OPCODE_COS:
150 /* 2 cycles:
151 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
152 *
153 * 18 cycles:
154 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
155 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
156 *
157 * Same for exp2, log2, rsq, sqrt, sin, cos.
158 */
159 latency = 16;
160 break;
161
162 case SHADER_OPCODE_POW:
163 /* 2 cycles:
164 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
165 *
166 * 26 cycles:
167 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
168 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
169 */
170 latency = 24;
171 break;
172
173 case SHADER_OPCODE_TEX:
174 case SHADER_OPCODE_TXD:
175 case SHADER_OPCODE_TXF:
176 case SHADER_OPCODE_TXL:
177 /* 18 cycles:
178 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
179 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
180 * send(8) g4<1>UW g114<8,8,1>F
181 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
182 *
183 * 697 +/-49 cycles (min 610, n=26):
184 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
185 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
186 * send(8) g4<1>UW g114<8,8,1>F
187 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
188 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
189 *
190 * So our first texture load of the batchbuffer takes ~700 cycles,
191 * since the caches are cold at that point.
192 *
193 * 840 +/- 92 cycles (min 720, n=25):
194 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
195 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
196 * send(8) g4<1>UW g114<8,8,1>F
197 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
198 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
199 * send(8) g4<1>UW g114<8,8,1>F
200 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
201 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
202 *
203 * On the second load, it takes just an extra ~140 cycles, and after
204 * accounting for the 14 cycles of the MOV's latency, that makes ~130.
205 *
206 * 683 +/- 49 cycles (min = 602, n=47):
207 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
208 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
209 * send(8) g4<1>UW g114<8,8,1>F
210 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
211 * send(8) g50<1>UW g114<8,8,1>F
212 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
213 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
214 *
215 * The unit appears to be pipelined, since this matches up with the
216 * cache-cold case, despite there being two loads here. If you replace
217 * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
218 *
219 * So, take some number between the cache-hot 140 cycles and the
220 * cache-cold 700 cycles. No particular tuning was done on this.
221 *
222 * I haven't done significant testing of the non-TEX opcodes. TXL at
223 * least looked about the same as TEX.
224 */
225 latency = 200;
226 break;
227
228 case SHADER_OPCODE_TXS:
229 /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
230 * cycles (n=15):
231 * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
232 * send(8) g6<1>UW g114<8,8,1>F
233 * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
234 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
235 *
236 *
237 * Two loads was 535 +/- 30 cycles (n=19):
238 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
239 * send(16) g6<1>UW g114<8,8,1>F
240 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
241 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
242 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
243 * send(16) g8<1>UW g114<8,8,1>F
244 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
245 * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
246 * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
247 *
248 * Since the only cache that should matter is the instruction/state
249 * cache containing the surface state, assume that we always have hot
250 * caches.
251 */
252 latency = 100;
253 break;
254
255 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
256 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
257 /* Testing with varying-index pull constants:
258 *
259 * 16 cycles:
260 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
261 * send(8) g4<1>F g4<8,8,1>D
262 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
263 *
264 * ~480 cycles:
265 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
266 * send(8) g4<1>F g4<8,8,1>D
267 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
268 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
269 *
270 * ~620 cycles:
271 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
272 * send(8) g4<1>F g4<8,8,1>D
273 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
274 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
275 * send(8) g4<1>F g4<8,8,1>D
276 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
277 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
278 *
279 * So, if it's cache-hot, it's about 140 (~620 minus ~480). If it's
280 * cache-cold, it's about 460 (~480 minus the 16-cycle case). We expect
281 * to mostly be cache-hot, so pick something more in that direction.
282 */
283 latency = 200;
284 break;
285
286 default:
287 /* 2 cycles:
288 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
289 *
290 * 16 cycles:
291 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
292 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
293 */
294 latency = 14;
295 break;
296 }
297 }
298
299 class instruction_scheduler {
300 public:
301 instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
302 bool post_reg_alloc)
303 {
304 this->v = v;
305 this->mem_ctx = ralloc_context(mem_ctx);
306 this->grf_count = grf_count;
307 this->instructions.make_empty();
308 this->instructions_to_schedule = 0;
309 this->post_reg_alloc = post_reg_alloc;
310 }
311
312 ~instruction_scheduler()
313 {
314 ralloc_free(this->mem_ctx);
315 }
316 void add_barrier_deps(schedule_node *n);
317 void add_dep(schedule_node *before, schedule_node *after, int latency);
318 void add_dep(schedule_node *before, schedule_node *after);
319
320 void add_inst(fs_inst *inst);
321 void calculate_deps();
322 void schedule_instructions(fs_inst *next_block_header);
323
324 bool is_compressed(fs_inst *inst);
325
326 void *mem_ctx;
327
328 bool post_reg_alloc;
329 int instructions_to_schedule;
330 int grf_count;
331 exec_list instructions;
332 fs_visitor *v;
333 };
334
335 void
336 instruction_scheduler::add_inst(fs_inst *inst)
337 {
338 schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen);
339
340 assert(!inst->is_head_sentinel());
341 assert(!inst->is_tail_sentinel());
342
343 this->instructions_to_schedule++;
344
345 inst->remove();
346 instructions.push_tail(n);
347 }
348
349 /**
350 * Add a dependency between two instruction nodes.
351 *
352 * The @after node will be scheduled after @before. We will try to
353 * schedule it @latency cycles after @before, but no guarantees there.
354 */
355 void
356 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
357 int latency)
358 {
359 if (!before || !after)
360 return;
361
362 assert(before != after);
363
364 for (int i = 0; i < before->child_count; i++) {
365 if (before->children[i] == after) {
366 before->child_latency[i] = MAX2(before->child_latency[i], latency);
367 return;
368 }
369 }
370
371 if (before->child_array_size <= before->child_count) {
372 if (before->child_array_size < 16)
373 before->child_array_size = 16;
374 else
375 before->child_array_size *= 2;
376
377 before->children = reralloc(mem_ctx, before->children,
378 schedule_node *,
379 before->child_array_size);
380 before->child_latency = reralloc(mem_ctx, before->child_latency,
381 int, before->child_array_size);
382 }
383
384 before->children[before->child_count] = after;
385 before->child_latency[before->child_count] = latency;
386 before->child_count++;
387 after->parent_count++;
388 }
389
390 void
391 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
392 {
393 if (!before)
394 return;
395
396 add_dep(before, after, before->latency);
397 }
398
399 /**
400 * Sometimes we really want this node to execute after everything that
401 * was before it and before everything that followed it. This adds
402 * the deps to do so.
403 */
404 void
405 instruction_scheduler::add_barrier_deps(schedule_node *n)
406 {
407 schedule_node *prev = (schedule_node *)n->prev;
408 schedule_node *next = (schedule_node *)n->next;
409
410 if (prev) {
411 while (!prev->is_head_sentinel()) {
412 add_dep(prev, n, 0);
413 prev = (schedule_node *)prev->prev;
414 }
415 }
416
417 if (next) {
418 while (!next->is_tail_sentinel()) {
419 add_dep(n, next, 0);
420 next = (schedule_node *)next->next;
421 }
422 }
423 }
424
425 /* Instruction scheduling needs to be aware of when an MRF write
426 * actually writes 2 MRFs.
427 */
428 bool
429 instruction_scheduler::is_compressed(fs_inst *inst)
430 {
431 return (v->dispatch_width == 16 &&
432 !inst->force_uncompressed &&
433 !inst->force_sechalf);
434 }
435
436 void
437 instruction_scheduler::calculate_deps()
438 {
439 /* Pre-register-allocation, this tracks the last write per VGRF (so
440 * different reg_offsets within it can interfere when they shouldn't).
441 * After register allocation, reg_offsets are gone and we track individual
442 * GRF registers.
443 */
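   /* For example, before allocation a texture result spanning several
    * reg_offsets of one VGRF shares a single last_grf_write[] slot, so a
    * write to one reg_offset and a read of another look dependent even
    * though they touch different registers; after allocation each hardware
    * GRF gets its own slot and only real overlaps create edges.
    */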
444 schedule_node *last_grf_write[grf_count];
445 schedule_node *last_mrf_write[BRW_MAX_MRF];
446 schedule_node *last_conditional_mod[2] = { NULL, NULL };
447 /* Fixed HW registers are assumed to be separate from the virtual
448 * GRFs, so they can be tracked separately. We don't really write
449 * to fixed GRFs much, so don't bother tracking them on a more
450 * granular level.
451 */
452 schedule_node *last_fixed_grf_write = NULL;
453 int reg_width = v->dispatch_width / 8;
454
455 /* The last instruction always needs to still be the last
456 * instruction. Either it's flow control (IF, ELSE, ENDIF, DO,
457 * WHILE) and scheduling other things after it would disturb the
458 * basic block, or it's FB_WRITE and we should do a better job at
459 * dead code elimination anyway.
460 */
461 schedule_node *last = (schedule_node *)instructions.get_tail();
462 add_barrier_deps(last);
463
464 memset(last_grf_write, 0, sizeof(last_grf_write));
465 memset(last_mrf_write, 0, sizeof(last_mrf_write));
466
467 /* top-to-bottom dependencies: RAW and WAW. */
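   /* For example, once an instruction writes a GRF, a later read of it
    * becomes a child of that write with the write's latency (RAW), and a
    * later write of the same GRF also becomes a child (WAW), so the two
    * writes can't be reordered past each other.
    */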
468 foreach_list(node, &instructions) {
469 schedule_node *n = (schedule_node *)node;
470 fs_inst *inst = n->inst;
471
472 /* read-after-write deps. */
473 for (int i = 0; i < 3; i++) {
474 if (inst->src[i].file == GRF) {
475 if (post_reg_alloc) {
476 for (int r = 0; r < reg_width; r++)
477 add_dep(last_grf_write[inst->src[i].reg + r], n);
478 } else {
479 add_dep(last_grf_write[inst->src[i].reg], n);
480 }
481 } else if (inst->src[i].file == FIXED_HW_REG &&
482 (inst->src[i].fixed_hw_reg.file ==
483 BRW_GENERAL_REGISTER_FILE)) {
484 if (post_reg_alloc) {
485 for (int r = 0; r < reg_width; r++)
486 add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
487 } else {
488 add_dep(last_fixed_grf_write, n);
489 }
490 } else if (inst->src[i].file != BAD_FILE &&
491 inst->src[i].file != IMM &&
492 inst->src[i].file != UNIFORM) {
493 assert(inst->src[i].file != MRF);
494 add_barrier_deps(n);
495 }
496 }
497
498 for (int i = 0; i < inst->mlen; i++) {
499 /* It looks like the MRF regs are released in the send
500 * instruction once it's sent, not when the result comes
501 * back.
502 */
503 add_dep(last_mrf_write[inst->base_mrf + i], n);
504 }
505
506 if (inst->predicate) {
507 add_dep(last_conditional_mod[inst->flag_subreg], n);
508 }
509
510 /* write-after-write deps. */
511 if (inst->dst.file == GRF) {
512 if (post_reg_alloc) {
513 for (int r = 0; r < inst->regs_written() * reg_width; r++) {
514 add_dep(last_grf_write[inst->dst.reg + r], n);
515 last_grf_write[inst->dst.reg + r] = n;
516 }
517 } else {
518 add_dep(last_grf_write[inst->dst.reg], n);
519 last_grf_write[inst->dst.reg] = n;
520 }
521 } else if (inst->dst.file == MRF) {
522 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
523
524 add_dep(last_mrf_write[reg], n);
525 last_mrf_write[reg] = n;
526 if (is_compressed(inst)) {
527 if (inst->dst.reg & BRW_MRF_COMPR4)
528 reg += 4;
529 else
530 reg++;
531 add_dep(last_mrf_write[reg], n);
532 last_mrf_write[reg] = n;
533 }
534 } else if (inst->dst.file == FIXED_HW_REG &&
535 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
536 if (post_reg_alloc) {
537 for (int r = 0; r < reg_width; r++)
538 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
539 } else {
540 last_fixed_grf_write = n;
541 }
542 } else if (inst->dst.file != BAD_FILE) {
543 add_barrier_deps(n);
544 }
545
546 if (inst->mlen > 0) {
547 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
548 add_dep(last_mrf_write[inst->base_mrf + i], n);
549 last_mrf_write[inst->base_mrf + i] = n;
550 }
551 }
552
553 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
554 * conditional_mod, because it sets the flag register.
555 */
556 if (inst->conditional_mod ||
557 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
558 add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
559 last_conditional_mod[inst->flag_subreg] = n;
560 }
561 }
562
563 /* bottom-to-top dependencies: WAR */
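   /* Walking the block bottom-to-top means every read we visit is earlier
    * in the program than the writes recorded so far, so a read of a
    * register that is overwritten later in the block produces a
    * read -> write edge, keeping the write from being scheduled above the
    * read.
    */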
564 memset(last_grf_write, 0, sizeof(last_grf_write));
565 memset(last_mrf_write, 0, sizeof(last_mrf_write));
566 memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
567 last_fixed_grf_write = NULL;
568
569 exec_node *node;
570 exec_node *prev;
571 for (node = instructions.get_tail(), prev = node->prev;
572 !node->is_head_sentinel();
573 node = prev, prev = node->prev) {
574 schedule_node *n = (schedule_node *)node;
575 fs_inst *inst = n->inst;
576
577 /* write-after-read deps. */
578 for (int i = 0; i < 3; i++) {
579 if (inst->src[i].file == GRF) {
580 if (post_reg_alloc) {
581 for (int r = 0; r < reg_width; r++)
582 add_dep(n, last_grf_write[inst->src[i].reg + r]);
583 } else {
584 add_dep(n, last_grf_write[inst->src[i].reg]);
585 }
586 } else if (inst->src[i].file == FIXED_HW_REG &&
587 (inst->src[i].fixed_hw_reg.file ==
588 BRW_GENERAL_REGISTER_FILE)) {
589 if (post_reg_alloc) {
590 for (int r = 0; r < reg_width; r++)
591 add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
592 } else {
593 add_dep(n, last_fixed_grf_write);
594 }
595 } else if (inst->src[i].file != BAD_FILE &&
596 inst->src[i].file != IMM &&
597 inst->src[i].file != UNIFORM) {
598 assert(inst->src[i].file != MRF);
599 add_barrier_deps(n);
600 }
601 }
602
603 for (int i = 0; i < inst->mlen; i++) {
604 /* It looks like the MRF regs are released in the send
605 * instruction once it's sent, not when the result comes
606 * back.
607 */
608 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
609 }
610
611 if (inst->predicate) {
612 add_dep(n, last_conditional_mod[inst->flag_subreg]);
613 }
614
615 /* Update the things this instruction wrote, so earlier reads
616 * can mark a WAR dependency on this write.
617 */
618 if (inst->dst.file == GRF) {
619 if (post_reg_alloc) {
620 for (int r = 0; r < inst->regs_written() * reg_width; r++)
621 last_grf_write[inst->dst.reg + r] = n;
622 } else {
623 last_grf_write[inst->dst.reg] = n;
624 }
625 } else if (inst->dst.file == MRF) {
626 int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
627
628 last_mrf_write[reg] = n;
629
630 if (is_compressed(inst)) {
631 if (inst->dst.reg & BRW_MRF_COMPR4)
632 reg += 4;
633 else
634 reg++;
635
636 last_mrf_write[reg] = n;
637 }
638 } else if (inst->dst.file == FIXED_HW_REG &&
639 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
640 if (post_reg_alloc) {
641 for (int r = 0; r < reg_width; r++)
642 last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
643 } else {
644 last_fixed_grf_write = n;
645 }
646 } else if (inst->dst.file != BAD_FILE) {
647 add_barrier_deps(n);
648 }
649
650 if (inst->mlen > 0) {
651 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
652 last_mrf_write[inst->base_mrf + i] = n;
653 }
654 }
655
656 /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
657 * conditional_mod, because it sets the flag register.
658 */
659 if (inst->conditional_mod ||
660 inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
661 last_conditional_mod[inst->flag_subreg] = n;
662 }
663 }
664 }
665
666 void
667 instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
668 {
669 int time = 0;
670
671 /* Remove non-DAG heads from the list. */
672 foreach_list_safe(node, &instructions) {
673 schedule_node *n = (schedule_node *)node;
674 if (n->parent_count != 0)
675 n->remove();
676 }
677
678 while (!instructions.is_empty()) {
679 schedule_node *chosen = NULL;
680 int chosen_time = 0;
681
682 if (post_reg_alloc) {
683 /* Of the instructions that are ready to execute, or the closest to
684 * being ready, choose the oldest one.
685 */
686 foreach_list(node, &instructions) {
687 schedule_node *n = (schedule_node *)node;
688
689 if (!chosen || n->unblocked_time < chosen_time) {
690 chosen = n;
691 chosen_time = n->unblocked_time;
692 }
693 }
694 } else {
695 /* Before register allocation, we don't care about the latencies of
696 * instructions. All we care about is reducing live intervals of
697 * variables so that we can avoid register spilling, or get 16-wide
698 * shaders which naturally do a better job of hiding instruction
699 * latency.
700 *
701 * To do so, schedule our instructions in a roughly LIFO/depth-first
702 * order: when new instructions become available as a result of
703 * scheduling something, choose those first so that our result
704 * hopefully is consumed quickly.
705 *
706 * The exception is messages that generate more than one result
707 * register (AKA texturing). In those cases, the LIFO search would
708 * normally tend to choose them quickly (because scheduling the
709 * previous message not only unblocked the children using its result,
710 * but also the MRF setup for the next sampler message, which in turn
711 * unblocks the next sampler message).
712 */
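         /* So prefer the newest available instruction that writes at most
          * one register, and take a multi-register message only when
          * nothing else is available.
          */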
713 for (schedule_node *node = (schedule_node *)instructions.get_tail();
714 node != instructions.get_head()->prev;
715 node = (schedule_node *)node->prev) {
716 schedule_node *n = (schedule_node *)node;
717
718 chosen = n;
719 if (chosen->inst->regs_written() <= 1)
720 break;
721 }
722
723 chosen_time = chosen->unblocked_time;
724 }
725
726 /* Schedule this instruction. */
727 assert(chosen);
728 chosen->remove();
729 next_block_header->insert_before(chosen->inst);
730 instructions_to_schedule--;
731
732 /* Bump the clock. Instructions in gen hardware are handled one simd4
733 * vector at a time, with 1 cycle per vector dispatched. Thus 8-wide
734 * pixel shaders take 2 cycles to dispatch and 16-wide (compressed)
735 * instructions take 4.
736 */
737 if (is_compressed(chosen->inst))
738 time += 4;
739 else
740 time += 2;
741
742 /* If we expected a delay for scheduling, then bump the clock to reflect
743 * that as well. In reality, the hardware will switch to another
744 * hyperthread and may not return to dispatching our thread for a while
745 * even after we're unblocked.
746 */
747 time = MAX2(time, chosen_time);
748
749 if (debug) {
750 printf("clock %4d, scheduled: ", time);
751 v->dump_instruction(chosen->inst);
752 }
753
754 /* Now that we've scheduled a new instruction, some of its
755 * children can be promoted to the list of instructions ready to
756 * be scheduled. Update the children's unblocked time for this
757 * DAG edge as we do so.
758 */
759 for (int i = 0; i < chosen->child_count; i++) {
760 schedule_node *child = chosen->children[i];
761
762 child->unblocked_time = MAX2(child->unblocked_time,
763 time + chosen->child_latency[i]);
764
765 child->parent_count--;
766 if (child->parent_count == 0) {
767 if (debug) {
768 printf("now available: ");
769 v->dump_instruction(child->inst);
770 }
771 instructions.push_tail(child);
772 }
773 }
774
775 /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
776 * but it's more limited pre-gen6, so if we send something off to it then
777 * the next math instruction isn't going to make progress until the first
778 * is done.
779 */
780 if (chosen->inst->is_math()) {
781 foreach_list(node, &instructions) {
782 schedule_node *n = (schedule_node *)node;
783
784 if (n->inst->is_math())
785 n->unblocked_time = MAX2(n->unblocked_time,
786 time + chosen->latency);
787 }
788 }
789 }
790
791 assert(instructions_to_schedule == 0);
792 }
793
794 void
795 fs_visitor::schedule_instructions(bool post_reg_alloc)
796 {
797 fs_inst *next_block_header = (fs_inst *)instructions.head;
798
799 int grf_count;
800 if (post_reg_alloc)
801 grf_count = grf_used;
802 else
803 grf_count = virtual_grf_count;
804
805 if (debug) {
806 printf("\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc);
807 dump_instructions();
808 }
809
810 instruction_scheduler sched(this, mem_ctx, grf_count, post_reg_alloc);
811
812 while (!next_block_header->is_tail_sentinel()) {
813 /* Add things to be scheduled until we get to a new BB. */
814 while (!next_block_header->is_tail_sentinel()) {
815 fs_inst *inst = next_block_header;
816 next_block_header = (fs_inst *)next_block_header->next;
817
818 sched.add_inst(inst);
819 if (inst->is_control_flow())
820 break;
821 }
822 sched.calculate_deps();
823 sched.schedule_instructions(next_block_header);
824 }
825
826 if (debug) {
827 printf("\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc);
828 dump_instructions();
829 }
830
831 this->live_intervals_valid = false;
832 }