/*
 * Copyright © 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/* Lowering for amul instructions, for drivers that support imul24.
 * This pass will analyze indirect derefs, and convert corresponding
 * amul instructions to either imul or imul24, depending on the
 * required range:
 *
 * 1) Analyze the uniform variables and build a table of UBOs and SSBOs
 *    that are either too large, or might be too large (unknown size)
 *    for imul24.
 *
 * 2) Loop through looking at all the intrinsics, finding dereferences of
 *    large variables, and recursively replacing all amul instructions
 *    used with imul.
 *
 * 3) Finally loop again through all instructions replacing any remaining
 *    amul with imul24.  At this point any remaining amul instructions
 *    are not involved in calculating an offset into a large variable,
 *    thanks to the 2nd step, so they can be safely replaced with imul24.
 *
 * Using two passes over all the instructions lets us handle the case
 * where, due to CSE, an amul is used to calculate an offset into both
 * a large and small variable.
 */
/* Per-run state for the pass.  The extraction dropped the struct
 * declaration itself; every field below is grounded by a use later in
 * the file (state->type_size, state->large_ubos[idx], state->large_ssbos[idx],
 * state->has_large_ubo, state->has_large_ssbo, state->max_slot).
 */
typedef struct {
   /* Driver-provided callback returning the size of a glsl type
    * (second arg selects bindless/!bindless sizing).
    */
   int (*type_size)(const struct glsl_type *, bool);

   /* Tables of UBOs and SSBOs mapping driver_location/base whether
    * they are too large to use imul24:
    */
   bool *large_ubos;
   bool *large_ssbos;

   /* for cases that we cannot determine UBO/SSBO index, track if *any*
    * UBO/SSBO is too large for imul24:
    */
   bool has_large_ubo;
   bool has_large_ssbo;

   /* One-past-the-last binding slot covered by the tables above. */
   unsigned max_slot;
} lower_state;
68 /* Lower 'amul's in offset src of large variables to 'imul': */
70 lower_large_src(nir_src
*src
, void *s
)
72 lower_state
*state
= s
;
76 nir_instr
*parent
= src
->ssa
->parent_instr
;
78 /* No need to visit instructions we've already visited.. this also
79 * avoids infinite recursion when phi's are involved:
81 if (parent
->pass_flags
)
84 bool progress
= nir_foreach_src(parent
, lower_large_src
, state
);
86 if (parent
->type
== nir_instr_type_alu
) {
87 nir_alu_instr
*alu
= nir_instr_as_alu(parent
);
88 if (alu
->op
== nir_op_amul
) {
89 alu
->op
= nir_op_imul
;
94 parent
->pass_flags
= 1;
100 large_ubo(lower_state
*state
, nir_src src
)
102 if (!nir_src_is_const(src
))
103 return state
->has_large_ubo
;
104 unsigned idx
= nir_src_as_uint(src
);
105 assert(idx
< state
->max_slot
);
106 return state
->large_ubos
[idx
];
110 large_ssbo(lower_state
*state
, nir_src src
)
112 if (!nir_src_is_const(src
))
113 return state
->has_large_ssbo
;
114 unsigned idx
= nir_src_as_uint(src
);
115 assert(idx
< state
->max_slot
);
116 return state
->large_ssbos
[idx
];
120 lower_intrinsic(lower_state
*state
, nir_intrinsic_instr
*intr
)
122 switch (intr
->intrinsic
) {
123 case nir_intrinsic_load_ubo
:
124 //# src[] = { buffer_index, offset }.
125 if (large_ubo(state
, intr
->src
[0]))
126 return lower_large_src(&intr
->src
[1], state
);
129 case nir_intrinsic_load_ssbo
:
130 //# src[] = { buffer_index, offset }.
131 if (large_ssbo(state
, intr
->src
[0]))
132 return lower_large_src(&intr
->src
[1], state
);
135 case nir_intrinsic_store_ssbo
:
136 //# src[] = { value, block_index, offset }
137 if (large_ssbo(state
, intr
->src
[1]))
138 return lower_large_src(&intr
->src
[2], state
);
141 case nir_intrinsic_ssbo_atomic_add
:
142 case nir_intrinsic_ssbo_atomic_imin
:
143 case nir_intrinsic_ssbo_atomic_umin
:
144 case nir_intrinsic_ssbo_atomic_imax
:
145 case nir_intrinsic_ssbo_atomic_umax
:
146 case nir_intrinsic_ssbo_atomic_and
:
147 case nir_intrinsic_ssbo_atomic_or
:
148 case nir_intrinsic_ssbo_atomic_xor
:
149 case nir_intrinsic_ssbo_atomic_exchange
:
150 case nir_intrinsic_ssbo_atomic_comp_swap
:
151 case nir_intrinsic_ssbo_atomic_fadd
:
152 case nir_intrinsic_ssbo_atomic_fmin
:
153 case nir_intrinsic_ssbo_atomic_fmax
:
154 case nir_intrinsic_ssbo_atomic_fcomp_swap
:
158 if (large_ssbo(state
, intr
->src
[0]))
159 return lower_large_src(&intr
->src
[1], state
);
162 case nir_intrinsic_global_atomic_add
:
163 case nir_intrinsic_global_atomic_imin
:
164 case nir_intrinsic_global_atomic_umin
:
165 case nir_intrinsic_global_atomic_imax
:
166 case nir_intrinsic_global_atomic_umax
:
167 case nir_intrinsic_global_atomic_and
:
168 case nir_intrinsic_global_atomic_or
:
169 case nir_intrinsic_global_atomic_xor
:
170 case nir_intrinsic_global_atomic_exchange
:
171 case nir_intrinsic_global_atomic_comp_swap
:
172 case nir_intrinsic_global_atomic_fadd
:
173 case nir_intrinsic_global_atomic_fmin
:
174 case nir_intrinsic_global_atomic_fmax
:
175 case nir_intrinsic_global_atomic_fcomp_swap
:
176 /* just assume we that 24b is not sufficient: */
177 return lower_large_src(&intr
->src
[0], state
);
179 /* These should all be small enough to unconditionally use imul24: */
180 case nir_intrinsic_shared_atomic_add
:
181 case nir_intrinsic_shared_atomic_imin
:
182 case nir_intrinsic_shared_atomic_umin
:
183 case nir_intrinsic_shared_atomic_imax
:
184 case nir_intrinsic_shared_atomic_umax
:
185 case nir_intrinsic_shared_atomic_and
:
186 case nir_intrinsic_shared_atomic_or
:
187 case nir_intrinsic_shared_atomic_xor
:
188 case nir_intrinsic_shared_atomic_exchange
:
189 case nir_intrinsic_shared_atomic_comp_swap
:
190 case nir_intrinsic_shared_atomic_fadd
:
191 case nir_intrinsic_shared_atomic_fmin
:
192 case nir_intrinsic_shared_atomic_fmax
:
193 case nir_intrinsic_shared_atomic_fcomp_swap
:
194 case nir_intrinsic_load_uniform
:
195 case nir_intrinsic_load_input
:
196 case nir_intrinsic_load_output
:
197 case nir_intrinsic_store_output
:
204 lower_instr(lower_state
*state
, nir_instr
*instr
)
206 bool progress
= false;
208 if (instr
->type
== nir_instr_type_intrinsic
) {
209 progress
|= lower_intrinsic(state
, nir_instr_as_intrinsic(instr
));
216 is_large(lower_state
*state
, nir_variable
*var
)
218 const struct glsl_type
*type
= glsl_without_array(var
->type
);
219 unsigned size
= state
->type_size(type
, false);
221 /* if size is not known (ie. VLA) then assume the worst: */
225 return size
>= (1 << 23);
229 nir_lower_amul(nir_shader
*shader
,
230 int (*type_size
)(const struct glsl_type
*, bool))
232 assert(shader
->options
->has_imul24
);
235 /* uniforms list actually includes ubo's and ssbo's: */
238 nir_foreach_variable (var
, &shader
->uniforms
) {
239 if (!(var
->data
.mode
& (nir_var_mem_ubo
| nir_var_mem_ssbo
)))
242 int base
= var
->data
.binding
;
243 int size
= MAX2(1, glsl_array_size(var
->type
));
245 max_slot
= MAX2(max_slot
, base
+ size
);
248 NIR_VLA_FILL(bool, large_ubos
, max_slot
, 0);
249 NIR_VLA_FILL(bool, large_ssbos
, max_slot
, 0);
251 lower_state state
= {
252 .type_size
= type_size
,
253 .large_ubos
= large_ubos
,
254 .large_ssbos
= large_ssbos
,
255 .max_slot
= max_slot
,
258 /* Figure out which UBOs or SSBOs are large enough to be
259 * disqualified from imul24:
261 nir_foreach_variable(var
, &shader
->uniforms
) {
262 if (var
->data
.mode
== nir_var_mem_ubo
) {
263 if (is_large(&state
, var
)) {
264 state
.has_large_ubo
= true;
265 unsigned size
= MAX2(1, glsl_array_size(var
->type
));
266 for (unsigned i
= 0; i
< size
; i
++)
267 state
.large_ubos
[var
->data
.binding
+ i
] = true;
269 } else if (var
->data
.mode
== nir_var_mem_ssbo
) {
270 if (is_large(&state
, var
)) {
271 state
.has_large_ssbo
= true;
272 unsigned size
= MAX2(1, glsl_array_size(var
->type
));
273 for (unsigned i
= 0; i
< size
; i
++)
274 state
.large_ssbos
[var
->data
.binding
+ i
] = true;
279 /* clear pass flags: */
280 nir_foreach_function(function
, shader
) {
281 nir_function_impl
*impl
= function
->impl
;
285 nir_foreach_block(block
, impl
) {
286 nir_foreach_instr(instr
, block
) {
287 instr
->pass_flags
= 0;
292 bool progress
= false;
293 nir_foreach_function(function
, shader
) {
294 nir_function_impl
*impl
= function
->impl
;
299 nir_foreach_block(block
, impl
) {
300 nir_foreach_instr(instr
, block
) {
301 progress
|= lower_instr(&state
, instr
);
306 /* At this point, all 'amul's used in calculating an offset into
307 * a large variable have been replaced with 'imul'. So remaining
308 * 'amul's can be replaced with 'imul24':
310 nir_foreach_function(function
, shader
) {
311 nir_function_impl
*impl
= function
->impl
;
316 nir_foreach_block(block
, impl
) {
317 nir_foreach_instr(instr
, block
) {
318 if (instr
->type
!= nir_instr_type_alu
)
321 nir_alu_instr
*alu
= nir_instr_as_alu(instr
);
322 if (alu
->op
!= nir_op_amul
)
325 alu
->op
= nir_op_imul24
;
330 nir_metadata_preserve(impl
, nir_metadata_block_index
|
331 nir_metadata_dominance
);