intel/fs: Use SHADER_OPCODE_SEND for surface messages
[mesa.git] src/intel/compiler/brw_schedule_instructions.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "brw_fs.h"
29 #include "brw_fs_live_variables.h"
30 #include "brw_vec4.h"
31 #include "brw_cfg.h"
32 #include "brw_shader.h"
33
34 using namespace brw;
35
36 /** @file brw_schedule_instructions.cpp
37 *
38 * List scheduling of FS instructions.
39 *
40 * The basic model of the list scheduler is to take a basic block,
41 * compute a DAG of the dependencies (RAW ordering with latency, WAW
42 * ordering with latency, WAR ordering), and make a list of the DAG heads.
43 * Heuristically pick a DAG head, then put all the children that are
44 * now DAG heads into the list of things to schedule.
45 *
46 * The heuristic is the important part. We're trying to be cheap,
47 * since actually computing the optimal scheduling is NP complete.
48 * What we do is track a "current clock". When we schedule a node, we
49 * update the earliest-unblocked clock time of its children, and
50 * increment the clock. Then, when trying to schedule, we just pick
51 * the earliest-unblocked instruction to schedule.
52 *
53 * Note that often there will be many things which could execute
54 * immediately, and there are a range of heuristic options to choose
55 * from in picking among those.
56 */
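/* A minimal sketch of the loop described above, with illustrative names
 * (the real implementation is instruction_scheduler::schedule_instructions()
 * further down in this file):
 *
 *    while (!ready.is_empty()) {
 *       schedule_node *n = pick_candidate(ready);   // heuristic choice
 *       emit(n);
 *       clock = MAX2(clock, n->unblocked_time) + issue_time(n->inst);
 *       for (each child c of n) {
 *          c->unblocked_time = MAX2(c->unblocked_time, clock + latency(n, c));
 *          if (--c->parent_count == 0)
 *             ready.push_head(c);                   // c is now a DAG head
 *       }
 *    }
 */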
57
58 static bool debug = false;
59
60 class instruction_scheduler;
61
62 class schedule_node : public exec_node
63 {
64 public:
65 schedule_node(backend_instruction *inst, instruction_scheduler *sched);
66 void set_latency_gen4();
67 void set_latency_gen7(bool is_haswell);
68
69 backend_instruction *inst;
70 schedule_node **children;
71 int *child_latency;
72 int child_count;
73 int parent_count;
74 int child_array_size;
75 int unblocked_time;
76 int latency;
77
78 /**
79 * Which iteration of pushing groups of children onto the candidates list
80 * this node was a part of.
81 */
82 unsigned cand_generation;
83
84 /**
85 * This is the sum of the instruction's latency plus the maximum delay of
86 * its children, or just the issue_time if it's a leaf node.
87 */
88 int delay;
89
90 /**
91 * Preferred exit node among the (direct or indirect) successors of this
92 * node. Among the scheduler nodes blocked by this node, this will be the
93 * one that may cause earliest program termination, or NULL if none of the
94 * successors is an exit node.
95 */
96 schedule_node *exit;
97 };
98
99 /**
100 * Lower bound of the scheduling time after which one of the instructions
101 * blocked by this node may lead to program termination.
102 *
103 * exit_unblocked_time() determines a strict partial ordering relation '«' on
104 * the set of scheduler nodes as follows:
105 *
106 * n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
107 *
108 * which can be used to heuristically order nodes according to how early they
109 * can unblock an exit node and lead to program termination.
110 */
111 static inline int
112 exit_unblocked_time(const schedule_node *n)
113 {
114 return n->exit ? n->exit->unblocked_time : INT_MAX;
115 }
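/* Illustrative example: if node A blocks a FS_OPCODE_DISCARD_JUMP whose
 * unblocked_time is 20 while node B blocks no exit node at all, then
 * exit_unblocked_time(A) == 20 < exit_unblocked_time(B) == INT_MAX, so A « B
 * and the exit-aware heuristic in choose_instruction_to_schedule() prefers A.
 */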
116
117 void
118 schedule_node::set_latency_gen4()
119 {
120 int chans = 8;
121 int math_latency = 22;
122
123 switch (inst->opcode) {
124 case SHADER_OPCODE_RCP:
125 this->latency = 1 * chans * math_latency;
126 break;
127 case SHADER_OPCODE_RSQ:
128 this->latency = 2 * chans * math_latency;
129 break;
130 case SHADER_OPCODE_INT_QUOTIENT:
131 case SHADER_OPCODE_SQRT:
132 case SHADER_OPCODE_LOG2:
133 /* full precision log. partial is 2. */
134 this->latency = 3 * chans * math_latency;
135 break;
136 case SHADER_OPCODE_INT_REMAINDER:
137 case SHADER_OPCODE_EXP2:
138 /* full precision. partial is 3, same throughput. */
139 this->latency = 4 * chans * math_latency;
140 break;
141 case SHADER_OPCODE_POW:
142 this->latency = 8 * chans * math_latency;
143 break;
144 case SHADER_OPCODE_SIN:
145 case SHADER_OPCODE_COS:
146 /* minimum latency, max is 12 rounds. */
147 this->latency = 5 * chans * math_latency;
148 break;
149 default:
150 this->latency = 2;
151 break;
152 }
153 }
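/* Illustrative arithmetic for the table above: with chans = 8 and
 * math_latency = 22, SHADER_OPCODE_POW ends up with 8 * 8 * 22 = 1408 cycles,
 * SHADER_OPCODE_RCP with 1 * 8 * 22 = 176, and the default non-math case
 * stays at 2.
 */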
154
155 void
156 schedule_node::set_latency_gen7(bool is_haswell)
157 {
158 switch (inst->opcode) {
159 case BRW_OPCODE_MAD:
160 /* 2 cycles
161 * (since the last two src operands are in different register banks):
162 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
163 *
164 * 3 cycles on IVB, 4 on HSW
165 * (since the last two src operands are in the same register bank):
166 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
167 *
168 * 18 cycles on IVB, 16 on HSW
169 * (since the last two src operands are in different register banks):
170 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
171 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
172 *
173 * 20 cycles on IVB, 18 on HSW
174 * (since the last two src operands are in the same register bank):
175 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
176 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
177 */
178
179 /* Our register allocator doesn't know about register banks, so use the
180 * higher latency.
181 */
182 latency = is_haswell ? 16 : 18;
183 break;
184
185 case BRW_OPCODE_LRP:
186 /* 2 cycles
187 * (since the last two src operands are in different register banks):
188 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
189 *
190 * 3 cycles on IVB, 4 on HSW
191 * (since the last two src operands are in the same register bank):
192 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
193 *
194 * 16 cycles on IVB, 14 on HSW
195 * (since the last two src operands are in different register banks):
196 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
197 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
198 *
199 * 16 cycles
200 * (since the last two src operands are in the same register bank):
201 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
202 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
203 */
204
205 /* Our register allocator doesn't know about register banks, so use the
206 * higher latency.
207 */
208 latency = 14;
209 break;
210
211 case SHADER_OPCODE_RCP:
212 case SHADER_OPCODE_RSQ:
213 case SHADER_OPCODE_SQRT:
214 case SHADER_OPCODE_LOG2:
215 case SHADER_OPCODE_EXP2:
216 case SHADER_OPCODE_SIN:
217 case SHADER_OPCODE_COS:
218 /* 2 cycles:
219 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
220 *
221 * 18 cycles:
222 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
223 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
224 *
225 * Same for exp2, log2, rsq, sqrt, sin, cos.
226 */
227 latency = is_haswell ? 14 : 16;
228 break;
229
230 case SHADER_OPCODE_POW:
231 /* 2 cycles:
232 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
233 *
234 * 26 cycles:
235 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
236 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
237 */
238 latency = is_haswell ? 22 : 24;
239 break;
240
241 case SHADER_OPCODE_TEX:
242 case SHADER_OPCODE_TXD:
243 case SHADER_OPCODE_TXF:
244 case SHADER_OPCODE_TXF_LZ:
245 case SHADER_OPCODE_TXL:
246 case SHADER_OPCODE_TXL_LZ:
247 /* 18 cycles:
248 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
249 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
250 * send(8) g4<1>UW g114<8,8,1>F
251 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
252 *
253 * 697 +/-49 cycles (min 610, n=26):
254 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
255 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
256 * send(8) g4<1>UW g114<8,8,1>F
257 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
258 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
259 *
260 * So the latency on our first texture load of the batchbuffer takes
261 * ~700 cycles, since the caches are cold at that point.
262 *
263 * 840 +/- 92 cycles (min 720, n=25):
264 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
265 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
266 * send(8) g4<1>UW g114<8,8,1>F
267 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
268 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
269 * send(8) g4<1>UW g114<8,8,1>F
270 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
271 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
272 *
273 * On the second load, it takes just an extra ~140 cycles, and after
274 * accounting for the 14 cycles of the MOV's latency, that makes ~130.
275 *
276 * 683 +/- 49 cycles (min = 602, n=47):
277 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
278 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
279 * send(8) g4<1>UW g114<8,8,1>F
280 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
281 * send(8) g50<1>UW g114<8,8,1>F
282 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
283 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
284 *
285 * The unit appears to be pipelined, since this matches up with the
286 * cache-cold case, despite there being two loads here. If you replace
287 * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
288 *
289 * So, take some number between the cache-hot 140 cycles and the
290 * cache-cold 700 cycles. No particular tuning was done on this.
291 *
292 * I haven't done significant testing of the non-TEX opcodes. TXL at
293 * least looked about the same as TEX.
294 */
295 latency = 200;
296 break;
297
298 case SHADER_OPCODE_TXS:
299 /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
300 * cycles (n=15):
301 * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
302 * send(8) g6<1>UW g114<8,8,1>F
303 * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
304 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
305 *
306 *
307 * Two loads was 535 +/- 30 cycles (n=19):
308 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
309 * send(16) g6<1>UW g114<8,8,1>F
310 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
311 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
312 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
313 * send(16) g8<1>UW g114<8,8,1>F
314 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
315 * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
316 * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
317 *
318 * Since the only caches that should matter are just the
319 * instruction/state cache containing the surface state, assume that we
320 * always have hot caches.
321 */
322 latency = 100;
323 break;
324
325 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
326 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
327 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
328 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
329 case VS_OPCODE_PULL_CONSTANT_LOAD:
330 /* testing using varying-index pull constants:
331 *
332 * 16 cycles:
333 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
334 * send(8) g4<1>F g4<8,8,1>D
335 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
336 *
337 * ~480 cycles:
338 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
339 * send(8) g4<1>F g4<8,8,1>D
340 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
341 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
342 *
343 * ~620 cycles:
344 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
345 * send(8) g4<1>F g4<8,8,1>D
346 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
347 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
348 * send(8) g4<1>F g4<8,8,1>D
349 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
350 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
351 *
352 * So, if it's cache-hot, it's about 140. If it's cache cold, it's
353 * about 460. We expect to mostly be cache hot, so pick something more
354 * in that direction.
355 */
356 latency = 200;
357 break;
358
359 case SHADER_OPCODE_GEN7_SCRATCH_READ:
360 /* Testing a load from offset 0, that had been previously written:
361 *
362 * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
363 * mov(8) null g114<8,8,1>F { align1 WE_normal 1Q };
364 *
365 * The cycles spent seemed to be grouped around 40-50 (as low as 38),
366 * then around 140. Presumably this is cache hit vs miss.
367 */
368 latency = 50;
369 break;
370
371 case SHADER_OPCODE_UNTYPED_ATOMIC:
372 case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
373 case SHADER_OPCODE_TYPED_ATOMIC:
374 /* Test code:
375 * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q };
376 * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all };
377 * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q };
378 * send(8) g4<1>ud g112<8,8,1>ud
379 * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q };
380 *
381 * Running it 100 times as fragment shader on a 128x128 quad
382 * gives an average latency of 13867 cycles per atomic op,
383 * standard deviation 3%. Note that this is a rather
384 * pessimistic estimate, the actual latency in cases with few
385 * collisions between threads and favorable pipelining has been
386 * seen to be reduced by a factor of 100.
387 */
388 latency = 14000;
389 break;
390
391 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
392 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
393 case SHADER_OPCODE_TYPED_SURFACE_READ:
394 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
395 /* Test code:
396 * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
397 * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
398 * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
399 * send(8) g4<1>UD g112<8,8,1>UD
400 * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
401 * .
402 * . [repeats 8 times]
403 * .
404 * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
405 * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
406 * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
407 * send(8) g4<1>UD g112<8,8,1>UD
408 * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
409 *
410 * Running it 100 times as fragment shader on a 128x128 quad
411 * gives an average latency of 583 cycles per surface read,
412 * standard deviation 0.9%.
413 */
414 latency = is_haswell ? 300 : 600;
415 break;
416
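/* For split sends the latency has to be derived from the descriptor instead
 * of the logical opcode: (inst->desc >> 14) & 0x1f extracts the data-port
 * message type, which is then matched against the same message kinds whose
 * latencies were measured for the legacy opcodes above.
 */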
417 case SHADER_OPCODE_SEND:
418 switch (inst->sfid) {
419 case GEN6_SFID_DATAPORT_RENDER_CACHE:
420 switch ((inst->desc >> 14) & 0x1f) {
421 case GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE:
422 case GEN7_DATAPORT_RC_TYPED_SURFACE_READ:
423 /* See also SHADER_OPCODE_TYPED_SURFACE_READ */
424 assert(!is_haswell);
425 latency = 600;
426 break;
427
428 case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP:
429 /* See also SHADER_OPCODE_TYPED_ATOMIC */
430 assert(!is_haswell);
431 latency = 14000;
432 break;
433
434 default:
435 unreachable("Unknown render cache message");
436 }
437 break;
438
439 case GEN7_SFID_DATAPORT_DATA_CACHE:
440 switch ((inst->desc >> 14) & 0x1f) {
441 case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ:
442 case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE:
443 /* We have no data for this but assume it's roughly the same as
444 * untyped surface read/write.
445 */
446 latency = 300;
447 break;
448
449 case GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ:
450 case GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE:
451 /* See also SHADER_OPCODE_UNTYPED_SURFACE_READ */
452 assert(!is_haswell);
453 latency = 600;
454 break;
455
456 case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
457 /* See also SHADER_OPCODE_UNTYPED_ATOMIC */
458 assert(!is_haswell);
459 latency = 14000;
460 break;
461
462 default:
463 unreachable("Unknown data cache message");
464 }
465 break;
466
467 case HSW_SFID_DATAPORT_DATA_CACHE_1:
468 switch ((inst->desc >> 14) & 0x1f) {
469 case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
470 case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
471 case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
472 case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
473 /* See also SHADER_OPCODE_UNTYPED_SURFACE_READ */
474 latency = 300;
475 break;
476
477 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
478 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
479 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
480 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
481 case GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP:
482 /* See also SHADER_OPCODE_UNTYPED_ATOMIC */
483 latency = 14000;
484 break;
485
486 default:
487 unreachable("Unknown data cache message");
488 }
489 break;
490
491 default:
492 unreachable("Unknown SFID");
493 }
494 break;
495
496 default:
497 /* 2 cycles:
498 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
499 *
500 * 16 cycles:
501 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
502 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
503 */
504 latency = 14;
505 break;
506 }
507 }
508
509 class instruction_scheduler {
510 public:
511 instruction_scheduler(backend_shader *s, int grf_count,
512 unsigned hw_reg_count, int block_count,
513 instruction_scheduler_mode mode)
514 {
515 this->bs = s;
516 this->mem_ctx = ralloc_context(NULL);
517 this->grf_count = grf_count;
518 this->hw_reg_count = hw_reg_count;
519 this->instructions.make_empty();
520 this->instructions_to_schedule = 0;
521 this->post_reg_alloc = (mode == SCHEDULE_POST);
522 this->mode = mode;
523 if (!post_reg_alloc) {
524 this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);
525
526 this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
527 for (int i = 0; i < block_count; i++)
528 this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
529 BITSET_WORDS(grf_count));
530
531 this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
532 for (int i = 0; i < block_count; i++)
533 this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
534 BITSET_WORDS(grf_count));
535
536 this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
537 for (int i = 0; i < block_count; i++)
538 this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
539 BITSET_WORDS(hw_reg_count));
540
541 this->written = rzalloc_array(mem_ctx, bool, grf_count);
542
543 this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);
544
545 this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
546 } else {
547 this->reg_pressure_in = NULL;
548 this->livein = NULL;
549 this->liveout = NULL;
550 this->hw_liveout = NULL;
551 this->written = NULL;
552 this->reads_remaining = NULL;
553 this->hw_reads_remaining = NULL;
554 }
555 }
556
557 ~instruction_scheduler()
558 {
559 ralloc_free(this->mem_ctx);
560 }
561 void add_barrier_deps(schedule_node *n);
562 void add_dep(schedule_node *before, schedule_node *after, int latency);
563 void add_dep(schedule_node *before, schedule_node *after);
564
565 void run(cfg_t *cfg);
566 void add_insts_from_block(bblock_t *block);
567 void compute_delays();
568 void compute_exits();
569 virtual void calculate_deps() = 0;
570 virtual schedule_node *choose_instruction_to_schedule() = 0;
571
572 /**
573 * Returns how many cycles it takes the instruction to issue.
574 *
575 * Instructions in gen hardware are handled one simd4 vector at a time,
576 * with 1 cycle per vector dispatched. Thus SIMD8 pixel shaders take 2
577 * cycles to dispatch and SIMD16 (compressed) instructions take 4.
578 */
579 virtual int issue_time(backend_instruction *inst) = 0;
580
581 virtual void count_reads_remaining(backend_instruction *inst) = 0;
582 virtual void setup_liveness(cfg_t *cfg) = 0;
583 virtual void update_register_pressure(backend_instruction *inst) = 0;
584 virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
585
586 void schedule_instructions(bblock_t *block);
587
588 void *mem_ctx;
589
590 bool post_reg_alloc;
591 int instructions_to_schedule;
592 int grf_count;
593 unsigned hw_reg_count;
594 int reg_pressure;
595 int block_idx;
596 exec_list instructions;
597 backend_shader *bs;
598
599 instruction_scheduler_mode mode;
600
601 /*
602 * The register pressure at the beginning of each basic block.
603 */
604
605 int *reg_pressure_in;
606
607 /*
608 * The virtual GRF's whose range overlaps the beginning of each basic block.
609 */
610
611 BITSET_WORD **livein;
612
613 /*
614 * The virtual GRF's whose range overlaps the end of each basic block.
615 */
616
617 BITSET_WORD **liveout;
618
619 /*
620 * The hardware GRF's whose range overlaps the end of each basic block.
621 */
622
623 BITSET_WORD **hw_liveout;
624
625 /*
626 * Whether we've scheduled a write for this virtual GRF yet.
627 */
628
629 bool *written;
630
631 /*
632 * How many reads we haven't scheduled for this virtual GRF yet.
633 */
634
635 int *reads_remaining;
636
637 /*
638 * How many reads we haven't scheduled for this hardware GRF yet.
639 */
640
641 int *hw_reads_remaining;
642 };
643
644 class fs_instruction_scheduler : public instruction_scheduler
645 {
646 public:
647 fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count,
648 int block_count,
649 instruction_scheduler_mode mode);
650 void calculate_deps();
651 bool is_compressed(fs_inst *inst);
652 schedule_node *choose_instruction_to_schedule();
653 int issue_time(backend_instruction *inst);
654 fs_visitor *v;
655
656 void count_reads_remaining(backend_instruction *inst);
657 void setup_liveness(cfg_t *cfg);
658 void update_register_pressure(backend_instruction *inst);
659 int get_register_pressure_benefit(backend_instruction *inst);
660 };
661
662 fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
663 int grf_count, int hw_reg_count,
664 int block_count,
665 instruction_scheduler_mode mode)
666 : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
667 v(v)
668 {
669 }
670
671 static bool
672 is_src_duplicate(fs_inst *inst, int src)
673 {
674 for (int i = 0; i < src; i++)
675 if (inst->src[i].equals(inst->src[src]))
676 return true;
677
678 return false;
679 }
680
681 void
682 fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
683 {
684 fs_inst *inst = (fs_inst *)be;
685
686 if (!reads_remaining)
687 return;
688
689 for (int i = 0; i < inst->sources; i++) {
690 if (is_src_duplicate(inst, i))
691 continue;
692
693 if (inst->src[i].file == VGRF) {
694 reads_remaining[inst->src[i].nr]++;
695 } else if (inst->src[i].file == FIXED_GRF) {
696 if (inst->src[i].nr >= hw_reg_count)
697 continue;
698
699 for (unsigned j = 0; j < regs_read(inst, i); j++)
700 hw_reads_remaining[inst->src[i].nr + j]++;
701 }
702 }
703 }
704
705 void
706 fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
707 {
708 /* First, compute liveness on a per-GRF level using the in/out sets from
709 * liveness calculation.
710 */
711 for (int block = 0; block < cfg->num_blocks; block++) {
712 for (int i = 0; i < v->live_intervals->num_vars; i++) {
713 if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
714 int vgrf = v->live_intervals->vgrf_from_var[i];
715 if (!BITSET_TEST(livein[block], vgrf)) {
716 reg_pressure_in[block] += v->alloc.sizes[vgrf];
717 BITSET_SET(livein[block], vgrf);
718 }
719 }
720
721 if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
722 BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
723 }
724 }
725
726 /* Now, extend the live in/live out sets for when a range crosses a block
727 * boundary, which matches what our register allocator/interference code
728 * does to account for force_writemask_all and incompatible exec_mask's.
729 */
730 for (int block = 0; block < cfg->num_blocks - 1; block++) {
731 for (int i = 0; i < grf_count; i++) {
732 if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
733 v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
734 if (!BITSET_TEST(livein[block + 1], i)) {
735 reg_pressure_in[block + 1] += v->alloc.sizes[i];
736 BITSET_SET(livein[block + 1], i);
737 }
738
739 BITSET_SET(liveout[block], i);
740 }
741 }
742 }
743
744 int payload_last_use_ip[hw_reg_count];
745 v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);
746
747 for (unsigned i = 0; i < hw_reg_count; i++) {
748 if (payload_last_use_ip[i] == -1)
749 continue;
750
751 for (int block = 0; block < cfg->num_blocks; block++) {
752 if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
753 reg_pressure_in[block]++;
754
755 if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
756 BITSET_SET(hw_liveout[block], i);
757 }
758 }
759 }
760
761 void
762 fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
763 {
764 fs_inst *inst = (fs_inst *)be;
765
766 if (!reads_remaining)
767 return;
768
769 if (inst->dst.file == VGRF) {
770 written[inst->dst.nr] = true;
771 }
772
773 for (int i = 0; i < inst->sources; i++) {
774 if (is_src_duplicate(inst, i))
775 continue;
776
777 if (inst->src[i].file == VGRF) {
778 reads_remaining[inst->src[i].nr]--;
779 } else if (inst->src[i].file == FIXED_GRF &&
780 inst->src[i].nr < hw_reg_count) {
781 for (unsigned off = 0; off < regs_read(inst, i); off++)
782 hw_reads_remaining[inst->src[i].nr + off]--;
783 }
784 }
785 }
786
787 int
788 fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
789 {
790 fs_inst *inst = (fs_inst *)be;
791 int benefit = 0;
792
793 if (inst->dst.file == VGRF) {
794 if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
795 !written[inst->dst.nr])
796 benefit -= v->alloc.sizes[inst->dst.nr];
797 }
798
799 for (int i = 0; i < inst->sources; i++) {
800 if (is_src_duplicate(inst, i))
801 continue;
802
803 if (inst->src[i].file == VGRF &&
804 !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
805 reads_remaining[inst->src[i].nr] == 1)
806 benefit += v->alloc.sizes[inst->src[i].nr];
807
808 if (inst->src[i].file == FIXED_GRF &&
809 inst->src[i].nr < hw_reg_count) {
810 for (unsigned off = 0; off < regs_read(inst, i); off++) {
811 int reg = inst->src[i].nr + off;
812 if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
813 hw_reads_remaining[reg] == 1) {
814 benefit++;
815 }
816 }
817 }
818 }
819
820 return benefit;
821 }
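/* Worked example with illustrative sizes: an instruction whose VGRF
 * destination of size 2 is neither live-in to this block nor written yet
 * contributes -2, while a size-4 VGRF source that is not live-out and has
 * exactly one unscheduled read left contributes +4, for a net benefit of +2
 * registers if the instruction is scheduled now.
 */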
822
823 class vec4_instruction_scheduler : public instruction_scheduler
824 {
825 public:
826 vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
827 void calculate_deps();
828 schedule_node *choose_instruction_to_schedule();
829 int issue_time(backend_instruction *inst);
830 vec4_visitor *v;
831
832 void count_reads_remaining(backend_instruction *inst);
833 void setup_liveness(cfg_t *cfg);
834 void update_register_pressure(backend_instruction *inst);
835 int get_register_pressure_benefit(backend_instruction *inst);
836 };
837
838 vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
839 int grf_count)
840 : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
841 v(v)
842 {
843 }
844
845 void
846 vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
847 {
848 }
849
850 void
851 vec4_instruction_scheduler::setup_liveness(cfg_t *)
852 {
853 }
854
855 void
856 vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
857 {
858 }
859
860 int
861 vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
862 {
863 return 0;
864 }
865
866 schedule_node::schedule_node(backend_instruction *inst,
867 instruction_scheduler *sched)
868 {
869 const struct gen_device_info *devinfo = sched->bs->devinfo;
870
871 this->inst = inst;
872 this->child_array_size = 0;
873 this->children = NULL;
874 this->child_latency = NULL;
875 this->child_count = 0;
876 this->parent_count = 0;
877 this->unblocked_time = 0;
878 this->cand_generation = 0;
879 this->delay = 0;
880 this->exit = NULL;
881
882 /* We can't measure Gen6 timings directly but expect them to be much
883 * closer to Gen7 than Gen4.
884 */
885 if (!sched->post_reg_alloc)
886 this->latency = 1;
887 else if (devinfo->gen >= 6)
888 set_latency_gen7(devinfo->is_haswell);
889 else
890 set_latency_gen4();
891 }
892
893 void
894 instruction_scheduler::add_insts_from_block(bblock_t *block)
895 {
896 foreach_inst_in_block(backend_instruction, inst, block) {
897 schedule_node *n = new(mem_ctx) schedule_node(inst, this);
898
899 instructions.push_tail(n);
900 }
901
902 this->instructions_to_schedule = block->end_ip - block->start_ip + 1;
903 }
904
905 /** Computation of the delay member of each node. */
906 void
907 instruction_scheduler::compute_delays()
908 {
909 foreach_in_list_reverse(schedule_node, n, &instructions) {
910 if (!n->child_count) {
911 n->delay = issue_time(n->inst);
912 } else {
913 for (int i = 0; i < n->child_count; i++) {
914 assert(n->children[i]->delay);
915 n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
916 }
917 }
918 }
919 }
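/* Example: for a dependency chain A -> B -> C where C is a leaf with
 * issue_time 2 and both A and B have latency 2, the reverse walk above gives
 * delay(C) = 2, delay(B) = 2 + 2 = 4 and delay(A) = 2 + 4 = 6, i.e. each
 * node's critical-path distance to the end of the block.
 */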
920
921 void
922 instruction_scheduler::compute_exits()
923 {
924 /* Calculate a lower bound of the scheduling time of each node in the
925 * graph. This is analogous to the node's critical path but calculated
926 * from the top instead of from the bottom of the block.
927 */
928 foreach_in_list(schedule_node, n, &instructions) {
929 for (int i = 0; i < n->child_count; i++) {
930 n->children[i]->unblocked_time =
931 MAX2(n->children[i]->unblocked_time,
932 n->unblocked_time + issue_time(n->inst) + n->child_latency[i]);
933 }
934 }
935
936 /* Calculate the exit of each node by induction based on the exit nodes of
937 * its children. The preferred exit of a node is the one among the exit
938 * nodes of its children which can be unblocked first according to the
939 * optimistic unblocked time estimate calculated above.
940 */
941 foreach_in_list_reverse(schedule_node, n, &instructions) {
942 n->exit = (n->inst->opcode == FS_OPCODE_DISCARD_JUMP ? n : NULL);
943
944 for (int i = 0; i < n->child_count; i++) {
945 if (exit_unblocked_time(n->children[i]) < exit_unblocked_time(n))
946 n->exit = n->children[i]->exit;
947 }
948 }
949 }
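/* Example: a FS_OPCODE_DISCARD_JUMP node is its own exit. Walking bottom-up,
 * each node adopts the exit of whichever child can be unblocked earliest, so
 * a chain of instructions feeding an early discard ends up tagged with that
 * discard as its preferred exit.
 */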
950
951 /**
952 * Add a dependency between two instruction nodes.
953 *
954 * The @after node will be scheduled after @before. We will try to
955 * schedule it @latency cycles after @before, but no guarantees there.
956 */
957 void
958 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
959 int latency)
960 {
961 if (!before || !after)
962 return;
963
964 assert(before != after);
965
966 for (int i = 0; i < before->child_count; i++) {
967 if (before->children[i] == after) {
968 before->child_latency[i] = MAX2(before->child_latency[i], latency);
969 return;
970 }
971 }
972
973 if (before->child_array_size <= before->child_count) {
974 if (before->child_array_size < 16)
975 before->child_array_size = 16;
976 else
977 before->child_array_size *= 2;
978
979 before->children = reralloc(mem_ctx, before->children,
980 schedule_node *,
981 before->child_array_size);
982 before->child_latency = reralloc(mem_ctx, before->child_latency,
983 int, before->child_array_size);
984 }
985
986 before->children[before->child_count] = after;
987 before->child_latency[before->child_count] = latency;
988 before->child_count++;
989 after->parent_count++;
990 }
991
992 void
993 instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
994 {
995 if (!before)
996 return;
997
998 add_dep(before, after, before->latency);
999 }
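/* Typical usage: add_dep(write_node, read_node) records a RAW edge with the
 * writer's full latency, while add_dep(reader_node, later_write_node, 0)
 * records a WAR edge that only enforces ordering. Calling add_dep() again for
 * the same pair just raises the recorded child_latency to the larger value.
 */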
1000
1001 static bool
1002 is_scheduling_barrier(const backend_instruction *inst)
1003 {
1004 return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
1005 inst->is_control_flow() ||
1006 inst->has_side_effects();
1007 }
1008
1009 /**
1010 * Sometimes we really want this node to execute after everything that
1011 * was before it and before everything that followed it. This adds
1012 * the deps to do so.
1013 */
1014 void
1015 instruction_scheduler::add_barrier_deps(schedule_node *n)
1016 {
1017 schedule_node *prev = (schedule_node *)n->prev;
1018 schedule_node *next = (schedule_node *)n->next;
1019
1020 if (prev) {
1021 while (!prev->is_head_sentinel()) {
1022 add_dep(prev, n, 0);
1023 if (is_scheduling_barrier(prev->inst))
1024 break;
1025 prev = (schedule_node *)prev->prev;
1026 }
1027 }
1028
1029 if (next) {
1030 while (!next->is_tail_sentinel()) {
1031 add_dep(n, next, 0);
1032 if (is_scheduling_barrier(next->inst))
1033 break;
1034 next = (schedule_node *)next->next;
1035 }
1036 }
1037 }
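/* Example: an untyped surface write has side effects and is therefore a
 * scheduling barrier. It gets zero-latency edges from every earlier node back
 * to the previous barrier and to every later node up to the next one, which
 * pins its position relative to other side-effecting instructions.
 */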
1038
1039 /* instruction scheduling needs to be aware of when an MRF write
1040 * actually writes 2 MRFs.
1041 */
1042 bool
1043 fs_instruction_scheduler::is_compressed(fs_inst *inst)
1044 {
1045 return inst->exec_size == 16;
1046 }
1047
1048 void
1049 fs_instruction_scheduler::calculate_deps()
1050 {
1051 /* Pre-register-allocation, this tracks the last write per VGRF offset.
1052 * After register allocation, reg_offsets are gone and we track individual
1053 * GRF registers.
1054 */
1055 schedule_node **last_grf_write;
1056 schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
1057 schedule_node *last_conditional_mod[8] = {};
1058 schedule_node *last_accumulator_write = NULL;
1059 /* Fixed HW registers are assumed to be separate from the virtual
1060 * GRFs, so they can be tracked separately. We don't really write
1061 * to fixed GRFs much, so don't bother tracking them on a more
1062 * granular level.
1063 */
1064 schedule_node *last_fixed_grf_write = NULL;
1065
1066 last_grf_write = (schedule_node **)calloc(sizeof(schedule_node *), grf_count * 16);
1067 memset(last_mrf_write, 0, sizeof(last_mrf_write));
1068
1069 /* top-to-bottom dependencies: RAW and WAW. */
1070 foreach_in_list(schedule_node, n, &instructions) {
1071 fs_inst *inst = (fs_inst *)n->inst;
1072
1073 if (is_scheduling_barrier(inst))
1074 add_barrier_deps(n);
1075
1076 /* read-after-write deps. */
1077 for (int i = 0; i < inst->sources; i++) {
1078 if (inst->src[i].file == VGRF) {
1079 if (post_reg_alloc) {
1080 for (unsigned r = 0; r < regs_read(inst, i); r++)
1081 add_dep(last_grf_write[inst->src[i].nr + r], n);
1082 } else {
1083 for (unsigned r = 0; r < regs_read(inst, i); r++) {
1084 add_dep(last_grf_write[inst->src[i].nr * 16 +
1085 inst->src[i].offset / REG_SIZE + r], n);
1086 }
1087 }
1088 } else if (inst->src[i].file == FIXED_GRF) {
1089 if (post_reg_alloc) {
1090 for (unsigned r = 0; r < regs_read(inst, i); r++)
1091 add_dep(last_grf_write[inst->src[i].nr + r], n);
1092 } else {
1093 add_dep(last_fixed_grf_write, n);
1094 }
1095 } else if (inst->src[i].is_accumulator()) {
1096 add_dep(last_accumulator_write, n);
1097 } else if (inst->src[i].file == ARF) {
1098 add_barrier_deps(n);
1099 }
1100 }
1101
1102 if (inst->base_mrf != -1) {
1103 for (int i = 0; i < inst->mlen; i++) {
1104 /* It looks like the MRF regs are released in the send
1105 * instruction once it's sent, not when the result comes
1106 * back.
1107 */
1108 add_dep(last_mrf_write[inst->base_mrf + i], n);
1109 }
1110 }
1111
1112 if (const unsigned mask = inst->flags_read(v->devinfo)) {
1113 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1114
1115 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1116 if (mask & (1 << i))
1117 add_dep(last_conditional_mod[i], n);
1118 }
1119 }
1120
1121 if (inst->reads_accumulator_implicitly()) {
1122 add_dep(last_accumulator_write, n);
1123 }
1124
1125 /* write-after-write deps. */
1126 if (inst->dst.file == VGRF) {
1127 if (post_reg_alloc) {
1128 for (unsigned r = 0; r < regs_written(inst); r++) {
1129 add_dep(last_grf_write[inst->dst.nr + r], n);
1130 last_grf_write[inst->dst.nr + r] = n;
1131 }
1132 } else {
1133 for (unsigned r = 0; r < regs_written(inst); r++) {
1134 add_dep(last_grf_write[inst->dst.nr * 16 +
1135 inst->dst.offset / REG_SIZE + r], n);
1136 last_grf_write[inst->dst.nr * 16 +
1137 inst->dst.offset / REG_SIZE + r] = n;
1138 }
1139 }
1140 } else if (inst->dst.file == MRF) {
1141 int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
1142
1143 add_dep(last_mrf_write[reg], n);
1144 last_mrf_write[reg] = n;
1145 if (is_compressed(inst)) {
1146 if (inst->dst.nr & BRW_MRF_COMPR4)
1147 reg += 4;
1148 else
1149 reg++;
1150 add_dep(last_mrf_write[reg], n);
1151 last_mrf_write[reg] = n;
1152 }
1153 } else if (inst->dst.file == FIXED_GRF) {
1154 if (post_reg_alloc) {
1155 for (unsigned r = 0; r < regs_written(inst); r++)
1156 last_grf_write[inst->dst.nr + r] = n;
1157 } else {
1158 last_fixed_grf_write = n;
1159 }
1160 } else if (inst->dst.is_accumulator()) {
1161 add_dep(last_accumulator_write, n);
1162 last_accumulator_write = n;
1163 } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
1164 add_barrier_deps(n);
1165 }
1166
1167 if (inst->mlen > 0 && inst->base_mrf != -1) {
1168 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1169 add_dep(last_mrf_write[inst->base_mrf + i], n);
1170 last_mrf_write[inst->base_mrf + i] = n;
1171 }
1172 }
1173
1174 if (const unsigned mask = inst->flags_written()) {
1175 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1176
1177 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1178 if (mask & (1 << i)) {
1179 add_dep(last_conditional_mod[i], n, 0);
1180 last_conditional_mod[i] = n;
1181 }
1182 }
1183 }
1184
1185 if (inst->writes_accumulator_implicitly(v->devinfo) &&
1186 !inst->dst.is_accumulator()) {
1187 add_dep(last_accumulator_write, n);
1188 last_accumulator_write = n;
1189 }
1190 }
1191
1192 /* bottom-to-top dependencies: WAR */
1193 memset(last_grf_write, 0, sizeof(schedule_node *) * grf_count * 16);
1194 memset(last_mrf_write, 0, sizeof(last_mrf_write));
1195 memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
1196 last_accumulator_write = NULL;
1197 last_fixed_grf_write = NULL;
1198
1199 foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
1200 fs_inst *inst = (fs_inst *)n->inst;
1201
1202 /* write-after-read deps. */
1203 for (int i = 0; i < inst->sources; i++) {
1204 if (inst->src[i].file == VGRF) {
1205 if (post_reg_alloc) {
1206 for (unsigned r = 0; r < regs_read(inst, i); r++)
1207 add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
1208 } else {
1209 for (unsigned r = 0; r < regs_read(inst, i); r++) {
1210 add_dep(n, last_grf_write[inst->src[i].nr * 16 +
1211 inst->src[i].offset / REG_SIZE + r], 0);
1212 }
1213 }
1214 } else if (inst->src[i].file == FIXED_GRF) {
1215 if (post_reg_alloc) {
1216 for (unsigned r = 0; r < regs_read(inst, i); r++)
1217 add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
1218 } else {
1219 add_dep(n, last_fixed_grf_write, 0);
1220 }
1221 } else if (inst->src[i].is_accumulator()) {
1222 add_dep(n, last_accumulator_write, 0);
1223 } else if (inst->src[i].file == ARF) {
1224 add_barrier_deps(n);
1225 }
1226 }
1227
1228 if (inst->base_mrf != -1) {
1229 for (int i = 0; i < inst->mlen; i++) {
1230 /* It looks like the MRF regs are released in the send
1231 * instruction once it's sent, not when the result comes
1232 * back.
1233 */
1234 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
1235 }
1236 }
1237
1238 if (const unsigned mask = inst->flags_read(v->devinfo)) {
1239 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1240
1241 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1242 if (mask & (1 << i))
1243 add_dep(n, last_conditional_mod[i]);
1244 }
1245 }
1246
1247 if (inst->reads_accumulator_implicitly()) {
1248 add_dep(n, last_accumulator_write);
1249 }
1250
1251 /* Update the things this instruction wrote, so earlier reads
1252 * can mark this as WAR dependency.
1253 */
1254 if (inst->dst.file == VGRF) {
1255 if (post_reg_alloc) {
1256 for (unsigned r = 0; r < regs_written(inst); r++)
1257 last_grf_write[inst->dst.nr + r] = n;
1258 } else {
1259 for (unsigned r = 0; r < regs_written(inst); r++) {
1260 last_grf_write[inst->dst.nr * 16 +
1261 inst->dst.offset / REG_SIZE + r] = n;
1262 }
1263 }
1264 } else if (inst->dst.file == MRF) {
1265 int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
1266
1267 last_mrf_write[reg] = n;
1268
1269 if (is_compressed(inst)) {
1270 if (inst->dst.nr & BRW_MRF_COMPR4)
1271 reg += 4;
1272 else
1273 reg++;
1274
1275 last_mrf_write[reg] = n;
1276 }
1277 } else if (inst->dst.file == FIXED_GRF) {
1278 if (post_reg_alloc) {
1279 for (unsigned r = 0; r < regs_written(inst); r++)
1280 last_grf_write[inst->dst.nr + r] = n;
1281 } else {
1282 last_fixed_grf_write = n;
1283 }
1284 } else if (inst->dst.is_accumulator()) {
1285 last_accumulator_write = n;
1286 } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
1287 add_barrier_deps(n);
1288 }
1289
1290 if (inst->mlen > 0 && inst->base_mrf != -1) {
1291 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1292 last_mrf_write[inst->base_mrf + i] = n;
1293 }
1294 }
1295
1296 if (const unsigned mask = inst->flags_written()) {
1297 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1298
1299 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1300 if (mask & (1 << i))
1301 last_conditional_mod[i] = n;
1302 }
1303 }
1304
1305 if (inst->writes_accumulator_implicitly(v->devinfo)) {
1306 last_accumulator_write = n;
1307 }
1308 }
1309
1310 free(last_grf_write);
1311 }
1312
1313 void
1314 vec4_instruction_scheduler::calculate_deps()
1315 {
1316 schedule_node *last_grf_write[grf_count];
1317 schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
1318 schedule_node *last_conditional_mod = NULL;
1319 schedule_node *last_accumulator_write = NULL;
1320 /* Fixed HW registers are assumed to be separate from the virtual
1321 * GRFs, so they can be tracked separately. We don't really write
1322 * to fixed GRFs much, so don't bother tracking them on a more
1323 * granular level.
1324 */
1325 schedule_node *last_fixed_grf_write = NULL;
1326
1327 memset(last_grf_write, 0, sizeof(last_grf_write));
1328 memset(last_mrf_write, 0, sizeof(last_mrf_write));
1329
1330 /* top-to-bottom dependencies: RAW and WAW. */
1331 foreach_in_list(schedule_node, n, &instructions) {
1332 vec4_instruction *inst = (vec4_instruction *)n->inst;
1333
1334 if (is_scheduling_barrier(inst))
1335 add_barrier_deps(n);
1336
1337 /* read-after-write deps. */
1338 for (int i = 0; i < 3; i++) {
1339 if (inst->src[i].file == VGRF) {
1340 for (unsigned j = 0; j < regs_read(inst, i); ++j)
1341 add_dep(last_grf_write[inst->src[i].nr + j], n);
1342 } else if (inst->src[i].file == FIXED_GRF) {
1343 add_dep(last_fixed_grf_write, n);
1344 } else if (inst->src[i].is_accumulator()) {
1345 assert(last_accumulator_write);
1346 add_dep(last_accumulator_write, n);
1347 } else if (inst->src[i].file == ARF) {
1348 add_barrier_deps(n);
1349 }
1350 }
1351
1352 if (inst->reads_g0_implicitly())
1353 add_dep(last_fixed_grf_write, n);
1354
1355 if (!inst->is_send_from_grf()) {
1356 for (int i = 0; i < inst->mlen; i++) {
1357 /* It looks like the MRF regs are released in the send
1358 * instruction once it's sent, not when the result comes
1359 * back.
1360 */
1361 add_dep(last_mrf_write[inst->base_mrf + i], n);
1362 }
1363 }
1364
1365 if (inst->reads_flag()) {
1366 assert(last_conditional_mod);
1367 add_dep(last_conditional_mod, n);
1368 }
1369
1370 if (inst->reads_accumulator_implicitly()) {
1371 assert(last_accumulator_write);
1372 add_dep(last_accumulator_write, n);
1373 }
1374
1375 /* write-after-write deps. */
1376 if (inst->dst.file == VGRF) {
1377 for (unsigned j = 0; j < regs_written(inst); ++j) {
1378 add_dep(last_grf_write[inst->dst.nr + j], n);
1379 last_grf_write[inst->dst.nr + j] = n;
1380 }
1381 } else if (inst->dst.file == MRF) {
1382 add_dep(last_mrf_write[inst->dst.nr], n);
1383 last_mrf_write[inst->dst.nr] = n;
1384 } else if (inst->dst.file == FIXED_GRF) {
1385 last_fixed_grf_write = n;
1386 } else if (inst->dst.is_accumulator()) {
1387 add_dep(last_accumulator_write, n);
1388 last_accumulator_write = n;
1389 } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
1390 add_barrier_deps(n);
1391 }
1392
1393 if (inst->mlen > 0 && !inst->is_send_from_grf()) {
1394 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1395 add_dep(last_mrf_write[inst->base_mrf + i], n);
1396 last_mrf_write[inst->base_mrf + i] = n;
1397 }
1398 }
1399
1400 if (inst->writes_flag()) {
1401 add_dep(last_conditional_mod, n, 0);
1402 last_conditional_mod = n;
1403 }
1404
1405 if (inst->writes_accumulator_implicitly(v->devinfo) &&
1406 !inst->dst.is_accumulator()) {
1407 add_dep(last_accumulator_write, n);
1408 last_accumulator_write = n;
1409 }
1410 }
1411
1412 /* bottom-to-top dependencies: WAR */
1413 memset(last_grf_write, 0, sizeof(last_grf_write));
1414 memset(last_mrf_write, 0, sizeof(last_mrf_write));
1415 last_conditional_mod = NULL;
1416 last_accumulator_write = NULL;
1417 last_fixed_grf_write = NULL;
1418
1419 foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
1420 vec4_instruction *inst = (vec4_instruction *)n->inst;
1421
1422 /* write-after-read deps. */
1423 for (int i = 0; i < 3; i++) {
1424 if (inst->src[i].file == VGRF) {
1425 for (unsigned j = 0; j < regs_read(inst, i); ++j)
1426 add_dep(n, last_grf_write[inst->src[i].nr + j]);
1427 } else if (inst->src[i].file == FIXED_GRF) {
1428 add_dep(n, last_fixed_grf_write);
1429 } else if (inst->src[i].is_accumulator()) {
1430 add_dep(n, last_accumulator_write);
1431 } else if (inst->src[i].file == ARF) {
1432 add_barrier_deps(n);
1433 }
1434 }
1435
1436 if (!inst->is_send_from_grf()) {
1437 for (int i = 0; i < inst->mlen; i++) {
1438 /* It looks like the MRF regs are released in the send
1439 * instruction once it's sent, not when the result comes
1440 * back.
1441 */
1442 add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
1443 }
1444 }
1445
1446 if (inst->reads_flag()) {
1447 add_dep(n, last_conditional_mod);
1448 }
1449
1450 if (inst->reads_accumulator_implicitly()) {
1451 add_dep(n, last_accumulator_write);
1452 }
1453
1454 /* Update the things this instruction wrote, so earlier reads
1455 * can mark this as WAR dependency.
1456 */
1457 if (inst->dst.file == VGRF) {
1458 for (unsigned j = 0; j < regs_written(inst); ++j)
1459 last_grf_write[inst->dst.nr + j] = n;
1460 } else if (inst->dst.file == MRF) {
1461 last_mrf_write[inst->dst.nr] = n;
1462 } else if (inst->dst.file == FIXED_GRF) {
1463 last_fixed_grf_write = n;
1464 } else if (inst->dst.is_accumulator()) {
1465 last_accumulator_write = n;
1466 } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
1467 add_barrier_deps(n);
1468 }
1469
1470 if (inst->mlen > 0 && !inst->is_send_from_grf()) {
1471 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1472 last_mrf_write[inst->base_mrf + i] = n;
1473 }
1474 }
1475
1476 if (inst->writes_flag()) {
1477 last_conditional_mod = n;
1478 }
1479
1480 if (inst->writes_accumulator_implicitly(v->devinfo)) {
1481 last_accumulator_write = n;
1482 }
1483 }
1484 }
1485
1486 schedule_node *
1487 fs_instruction_scheduler::choose_instruction_to_schedule()
1488 {
1489 schedule_node *chosen = NULL;
1490
1491 if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
1492 int chosen_time = 0;
1493
1494 /* Of the instructions ready to execute or the closest to being ready,
1495 * choose the one most likely to unblock an early program exit, or
1496 * otherwise the oldest one.
1497 */
1498 foreach_in_list(schedule_node, n, &instructions) {
1499 if (!chosen ||
1500 exit_unblocked_time(n) < exit_unblocked_time(chosen) ||
1501 (exit_unblocked_time(n) == exit_unblocked_time(chosen) &&
1502 n->unblocked_time < chosen_time)) {
1503 chosen = n;
1504 chosen_time = n->unblocked_time;
1505 }
1506 }
1507 } else {
1508 /* Before register allocation, we don't care about the latencies of
1509 * instructions. All we care about is reducing live intervals of
1510 * variables so that we can avoid register spilling, or get SIMD16
1511 * shaders which naturally do a better job of hiding instruction
1512 * latency.
1513 */
1514 foreach_in_list(schedule_node, n, &instructions) {
1515 fs_inst *inst = (fs_inst *)n->inst;
1516
1517 if (!chosen) {
1518 chosen = n;
1519 continue;
1520 }
1521
1522 /* Most important: If we can definitely reduce register pressure, do
1523 * so immediately.
1524 */
1525 int register_pressure_benefit = get_register_pressure_benefit(n->inst);
1526 int chosen_register_pressure_benefit =
1527 get_register_pressure_benefit(chosen->inst);
1528
1529 if (register_pressure_benefit > 0 &&
1530 register_pressure_benefit > chosen_register_pressure_benefit) {
1531 chosen = n;
1532 continue;
1533 } else if (chosen_register_pressure_benefit > 0 &&
1534 (register_pressure_benefit <
1535 chosen_register_pressure_benefit)) {
1536 continue;
1537 }
1538
1539 if (mode == SCHEDULE_PRE_LIFO) {
1540 /* Prefer instructions that recently became available for
1541 * scheduling. These are the things that are most likely to
1542 * (eventually) make a variable dead and reduce register pressure.
1543 * Typical register pressure estimates don't work for us because
1544 * most of our pressure comes from texturing, where no single
1545 * instruction to schedule will make a vec4 value dead.
1546 */
1547 if (n->cand_generation > chosen->cand_generation) {
1548 chosen = n;
1549 continue;
1550 } else if (n->cand_generation < chosen->cand_generation) {
1551 continue;
1552 }
1553
1554 /* On MRF-using chips, prefer non-SEND instructions. If we don't
1555 * do this, then because we prefer instructions that just became
1556 * candidates, we'll end up in a pattern of scheduling a SEND,
1557 * then the MRFs for the next SEND, then the next SEND, then the
1558 * MRFs, etc., without ever consuming the results of a send.
1559 */
1560 if (v->devinfo->gen < 7) {
1561 fs_inst *chosen_inst = (fs_inst *)chosen->inst;
1562
1563 /* We use size_written > 4 * exec_size as our test for the kind
1564 * of send instruction to avoid -- only sends generate many
1565 * regs, and a single-result send is probably actually reducing
1566 * register pressure.
1567 */
1568 if (inst->size_written <= 4 * inst->exec_size &&
1569 chosen_inst->size_written > 4 * chosen_inst->exec_size) {
1570 chosen = n;
1571 continue;
1572 } else if (inst->size_written > chosen_inst->size_written) {
1573 continue;
1574 }
1575 }
1576 }
1577
1578 /* For instructions pushed on the cands list at the same time, prefer
1579 * the one with the highest delay to the end of the program. This is
1580 * most likely to have its values able to be consumed first (such as
1581 * for a large tree of lowered ubo loads, which appear reversed in
1582 * the instruction stream with respect to when they can be consumed).
1583 */
1584 if (n->delay > chosen->delay) {
1585 chosen = n;
1586 continue;
1587 } else if (n->delay < chosen->delay) {
1588 continue;
1589 }
1590
1591 /* Prefer the node most likely to unblock an early program exit.
1592 */
1593 if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) {
1594 chosen = n;
1595 continue;
1596 } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) {
1597 continue;
1598 }
1599
1600 /* If all other metrics are equal, we prefer the first instruction in
1601 * the list (program execution).
1602 */
1603 }
1604 }
1605
1606 return chosen;
1607 }
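/* To summarize the pre-register-allocation tie-breaking implemented above:
 * (1) a strictly larger positive register-pressure benefit wins, (2) in LIFO
 * mode a newer cand_generation wins and, pre-gen7, small-result instructions
 * are preferred over multi-register sends, (3) a larger delay (longer
 * critical path) wins, (4) an earlier exit_unblocked_time wins, and (5)
 * otherwise the original program order is kept.
 */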
1608
1609 schedule_node *
1610 vec4_instruction_scheduler::choose_instruction_to_schedule()
1611 {
1612 schedule_node *chosen = NULL;
1613 int chosen_time = 0;
1614
1615 /* Of the instructions ready to execute or the closest to being ready,
1616 * choose the oldest one.
1617 */
1618 foreach_in_list(schedule_node, n, &instructions) {
1619 if (!chosen || n->unblocked_time < chosen_time) {
1620 chosen = n;
1621 chosen_time = n->unblocked_time;
1622 }
1623 }
1624
1625 return chosen;
1626 }
1627
1628 int
1629 fs_instruction_scheduler::issue_time(backend_instruction *inst)
1630 {
1631 const unsigned overhead = v->bank_conflict_cycles((fs_inst *)inst);
1632 if (is_compressed((fs_inst *)inst))
1633 return 4 + overhead;
1634 else
1635 return 2 + overhead;
1636 }
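/* Example: a SIMD16 (compressed) instruction with no bank-conflict penalty
 * issues in 4 cycles and a SIMD8 one in 2; whatever bank_conflict_cycles()
 * reports for the instruction is added on top.
 */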
1637
1638 int
1639 vec4_instruction_scheduler::issue_time(backend_instruction *)
1640 {
1641 /* We always execute as two vec4s in parallel. */
1642 return 2;
1643 }
1644
1645 void
1646 instruction_scheduler::schedule_instructions(bblock_t *block)
1647 {
1648 const struct gen_device_info *devinfo = bs->devinfo;
1649 int time = 0;
1650 if (!post_reg_alloc)
1651 reg_pressure = reg_pressure_in[block->num];
1652 block_idx = block->num;
1653
1654 /* Remove non-DAG heads from the list. */
1655 foreach_in_list_safe(schedule_node, n, &instructions) {
1656 if (n->parent_count != 0)
1657 n->remove();
1658 }
1659
1660 unsigned cand_generation = 1;
1661 while (!instructions.is_empty()) {
1662 schedule_node *chosen = choose_instruction_to_schedule();
1663
1664 /* Schedule this instruction. */
1665 assert(chosen);
1666 chosen->remove();
1667 chosen->inst->exec_node::remove();
1668 block->instructions.push_tail(chosen->inst);
1669 instructions_to_schedule--;
1670
1671 if (!post_reg_alloc) {
1672 reg_pressure -= get_register_pressure_benefit(chosen->inst);
1673 update_register_pressure(chosen->inst);
1674 }
1675
1676 /* If we expected a delay for scheduling, then bump the clock to reflect
1677 * that. In reality, the hardware will switch to another hyperthread
1678 * and may not return to dispatching our thread for a while even after
1679 * we're unblocked. After this, we have the time when the chosen
1680 * instruction will start executing.
1681 */
1682 time = MAX2(time, chosen->unblocked_time);
1683
1684 /* Update the clock for how soon an instruction could start after the
1685 * chosen one.
1686 */
1687 time += issue_time(chosen->inst);
1688
1689 if (debug) {
1690 fprintf(stderr, "clock %4d, scheduled: ", time);
1691 bs->dump_instruction(chosen->inst);
1692 if (!post_reg_alloc)
1693 fprintf(stderr, "(register pressure %d)\n", reg_pressure);
1694 }
1695
1696 /* Now that we've scheduled a new instruction, some of its
1697 * children can be promoted to the list of instructions ready to
1698 * be scheduled. Update the children's unblocked time for this
1699 * DAG edge as we do so.
1700 */
1701 for (int i = chosen->child_count - 1; i >= 0; i--) {
1702 schedule_node *child = chosen->children[i];
1703
1704 child->unblocked_time = MAX2(child->unblocked_time,
1705 time + chosen->child_latency[i]);
1706
1707 if (debug) {
1708 fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
1709 bs->dump_instruction(child->inst);
1710 }
1711
1712 child->cand_generation = cand_generation;
1713 child->parent_count--;
1714 if (child->parent_count == 0) {
1715 if (debug) {
1716 fprintf(stderr, "\t\tnow available\n");
1717 }
1718 instructions.push_head(child);
1719 }
1720 }
1721 cand_generation++;
1722
1723 /* Shared resource: the mathbox. There's one mathbox per EU on Gen6+
1724 * but it's more limited pre-gen6, so if we send something off to it then
1725 * the next math instruction isn't going to make progress until the first
1726 * is done.
1727 */
1728 if (devinfo->gen < 6 && chosen->inst->is_math()) {
1729 foreach_in_list(schedule_node, n, &instructions) {
1730 if (n->inst->is_math())
1731 n->unblocked_time = MAX2(n->unblocked_time,
1732 time + chosen->latency);
1733 }
1734 }
1735 }
1736
1737 assert(instructions_to_schedule == 0);
1738
1739 block->cycle_count = time;
1740 }
1741
1742 static unsigned get_cycle_count(cfg_t *cfg)
1743 {
1744 unsigned count = 0, multiplier = 1;
1745 foreach_block(block, cfg) {
1746 if (block->start()->opcode == BRW_OPCODE_DO)
1747 multiplier *= 10; /* assume that loops execute ~10 times */
1748
1749 count += block->cycle_count * multiplier;
1750
1751 if (block->end()->opcode == BRW_OPCODE_WHILE)
1752 multiplier /= 10;
1753 }
1754
1755 return count;
1756 }
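/* Example: with the ~10x-per-loop assumption above, a block nested inside two
 * loops contributes block->cycle_count * 100 to the total, while a block
 * outside any loop contributes its cycle_count exactly once.
 */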
1757
1758 void
1759 instruction_scheduler::run(cfg_t *cfg)
1760 {
1761 if (debug && !post_reg_alloc) {
1762 fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
1763 post_reg_alloc);
1764 bs->dump_instructions();
1765 }
1766
1767 if (!post_reg_alloc)
1768 setup_liveness(cfg);
1769
1770 foreach_block(block, cfg) {
1771 if (reads_remaining) {
1772 memset(reads_remaining, 0,
1773 grf_count * sizeof(*reads_remaining));
1774 memset(hw_reads_remaining, 0,
1775 hw_reg_count * sizeof(*hw_reads_remaining));
1776 memset(written, 0, grf_count * sizeof(*written));
1777
1778 foreach_inst_in_block(fs_inst, inst, block)
1779 count_reads_remaining(inst);
1780 }
1781
1782 add_insts_from_block(block);
1783
1784 calculate_deps();
1785
1786 compute_delays();
1787 compute_exits();
1788
1789 schedule_instructions(block);
1790 }
1791
1792 if (debug && !post_reg_alloc) {
1793 fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
1794 post_reg_alloc);
1795 bs->dump_instructions();
1796 }
1797
1798 cfg->cycle_count = get_cycle_count(cfg);
1799 }
1800
1801 void
1802 fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
1803 {
1804 if (mode != SCHEDULE_POST)
1805 calculate_live_intervals();
1806
1807 int grf_count;
1808 if (mode == SCHEDULE_POST)
1809 grf_count = grf_used;
1810 else
1811 grf_count = alloc.count;
1812
1813 fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
1814 cfg->num_blocks, mode);
1815 sched.run(cfg);
1816
1817 invalidate_live_intervals();
1818 }
1819
1820 void
1821 vec4_visitor::opt_schedule_instructions()
1822 {
1823 vec4_instruction_scheduler sched(this, prog_data->total_grf);
1824 sched.run(cfg);
1825
1826 invalidate_live_intervals();
1827 }