1 /* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
4 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 * Rob Clark <robclark@freedesktop.org>
29 #include "util/u_math.h"
36 * Calculates weighted instruction depth, ie. the sum of # of needed
37 * instructions plus delay slots back to original input (ie INPUT or
38 * CONST). That is to say, an instruction's depth is:
42 * // for each src register:
43 * foreach (src in instr->regs[1..n])
44 * d = max(d, delayslots(src->instr, n) + depth(src->instr));
48 * After an instruction's depth is calculated, it is inserted into the
49 * block's depth-sorted list, which is used by the scheduling pass.
52 /* calculate required # of delay slots between the instruction that
53 * assigns a value and the one that consumes
 *
 * assigner is the instruction producing the value, consumer is the
 * instruction reading it, and n is the zero-based index of the source
 * operand being considered (n == 2 is checked below for the cat3 mad
 * special case).
 * NOTE(review): the return statements are not visible in this view, so
 * the slot count chosen by each branch should be confirmed against the
 * full file.
55 int ir3_delayslots(struct ir3_instruction
*assigner
,
56 struct ir3_instruction
*consumer
, unsigned n
)
58 /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
59 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
60 * handled with sync bits
63 if (is_meta(assigner
))
66 if (writes_addr(assigner
))
69 /* handled via sync flags: */
70 if (is_sfu(assigner
) || is_tex(assigner
))
73 /* assigner must be alu: */
74 if (is_flow(consumer
) || is_sfu(consumer
) || is_tex(consumer
)) {
76 } else if ((consumer
->category
== 3) &&
77 is_mad(consumer
->opc
) && (n
== 2)) {
78 /* special case, 3rd src to cat3 not required on first cycle */
/* Looks for instr's position in its block's list: starting at
 * block->head, the loop continues while the current entry is non-NULL,
 * is not instr itself, and has a depth greater than instr->depth.
 * NOTE(review): the loop body and the code after the loop are not visible
 * in this view; presumably p trails n and instr is linked in at the
 * position found so the list stays ordered by depth - confirm against
 * the full file.
 */
85 static void insert_by_depth(struct ir3_instruction
*instr
)
87 struct ir3_block
*block
= instr
->block
;
88 struct ir3_instruction
*n
= block
->head
;
89 struct ir3_instruction
*p
= NULL
;
91 while (n
&& (n
!= instr
) && (n
->depth
> instr
->depth
)) {
/* Computes instr->depth. For each register from regs[1] onward that is
 * flagged IR3_REG_SSA, the producing instruction (src->instr) is visited
 * recursively first, then instr->depth becomes the MAX2 of its current
 * value and sd. sd starts with ir3_delayslots(src->instr, instr, i-1);
 * the remaining term of that sum is not visible in this view.
 * ir3_instr_check_mark() is used to bail out on instructions that were
 * already visited, and instr is passed to insert_by_depth() afterwards.
 */
103 static void ir3_instr_depth(struct ir3_instruction
*instr
)
107 /* if we've already visited this instruction, bail now: */
108 if (ir3_instr_check_mark(instr
))
113 for (i
= 1; i
< instr
->regs_count
; i
++) {
114 struct ir3_register
*src
= instr
->regs
[i
];
115 if (src
->flags
& IR3_REG_SSA
) {
118 /* visit child to compute its depth: */
119 ir3_instr_depth(src
->instr
);
121 sd
= ir3_delayslots(src
->instr
, instr
, i
-1) +
124 instr
->depth
= MAX2(instr
->depth
, sd
);
128 /* meta-instructions don't add cycles, other than PHI.. which
129 * might translate to a real instruction..
131 * well, not entirely true, fan-in/out, etc might need to need
132 * to generate some extra mov's in edge cases, etc.. probably
133 * we might want to do depth calculation considering the worst
139 insert_by_depth(instr
);
/* Runs the depth calculation for a block: calls
 * ir3_clear_mark(block->shader), then ir3_instr_depth() on every non-NULL
 * entry of block->outputs, and finally sets to NULL each non-NULL
 * block->inputs entry for which ir3_instr_check_mark() returns false
 * (i.e. an input the walk never visited, per the comment below).
 * NOTE(review): local declarations and any setup between the signature
 * and ir3_clear_mark() are not visible in this view.
 */
142 void ir3_block_depth(struct ir3_block
*block
)
148 ir3_clear_mark(block
->shader
);
149 for (i
= 0; i
< block
->noutputs
; i
++)
150 if (block
->outputs
[i
])
151 ir3_instr_depth(block
->outputs
[i
]);
153 /* at this point, any unvisited input is unused: */
154 for (i
= 0; i
< block
->ninputs
; i
++) {
155 struct ir3_instruction
*in
= block
->inputs
[i
];
156 if (in
&& !ir3_instr_check_mark(in
))
157 block
->inputs
[i
] = NULL
;