/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "util/bitscan.h"
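
/* Emits a copy of the given load/store intrinsic, overriding its vector
 * width, bit size, and alignment, and adding the given byte offset to the
 * original offset source.  For stores, store_src replaces the value source;
 * for loads, the newly created SSA destination is returned.
 */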
static nir_ssa_def *
dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                  nir_ssa_def *store_src, int offset,
                  unsigned num_components, unsigned bit_size,
                  unsigned align)
{
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];

   nir_intrinsic_instr *dup =
      nir_intrinsic_instr_create(b->shader, intrin->intrinsic);

   nir_src *intrin_offset_src = nir_get_io_offset_src(intrin);
   for (unsigned i = 0; i < info->num_srcs; i++) {
      assert(intrin->src[i].is_ssa);
      if (i == 0 && store_src) {
         assert(!info->has_dest);
         assert(&intrin->src[i] != intrin_offset_src);
         dup->src[i] = nir_src_for_ssa(store_src);
      } else if (&intrin->src[i] == intrin_offset_src) {
         dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa,
                                                       offset));
      } else {
         dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa);
      }
   }

   dup->num_components = num_components;

   for (unsigned i = 0; i < info->num_indices; i++)
      dup->const_index[i] = intrin->const_index[i];

   nir_intrinsic_set_align(dup, align, 0);

   if (info->has_dest) {
      assert(intrin->dest.is_ssa);
      nir_ssa_dest_init(&dup->instr, &dup->dest,
                        num_components, bit_size,
                        intrin->dest.ssa.name);
   } else {
      nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1);
   }

   nir_builder_instr_insert(b, &dup->instr);

   return info->has_dest ? &dup->dest.ssa : NULL;
}
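
/* Lowers a load whose bit size the hardware cannot handle directly.  With a
 * constant offset we can emit a single dword load and unpack the channels
 * out of it; otherwise we emit a series of smaller loads and recombine the
 * pieces into the original destination vector.
 */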
static bool
lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->dest.is_ssa);
   if (intrin->dest.ssa.bit_size == 32)
      return false;

   const unsigned bit_size = intrin->dest.ssa.bit_size;
   const unsigned num_components = intrin->dest.ssa.num_components;
   const unsigned bytes_read = num_components * (bit_size / 8);
   const unsigned align = nir_intrinsic_align(intrin);

   nir_ssa_def *result[NIR_MAX_VEC_COMPONENTS] = { NULL, };

   nir_src *offset_src = nir_get_io_offset_src(intrin);
   if (bit_size < 32 && nir_src_is_const(*offset_src)) {
      /* The offset is constant so we can use a 32-bit load and just shift it
       * around as needed.
       */
      const int load_offset = nir_src_as_uint(*offset_src) % 4;
      assert(load_offset % (bit_size / 8) == 0);
      const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4);
      /* A 16-bit vec4 is a 32-bit vec2.  We add an extra component in case
       * we offset into a component with load_offset.
       */
      assert(load_comps32 <= 3);

      nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset,
                                            load_comps32, 32, 4);
      nir_ssa_def *unpacked[3];
      for (unsigned i = 0; i < load_comps32; i++)
         unpacked[i] = nir_unpack_bits(b, nir_channel(b, load, i), bit_size);

      assert(load_offset % (bit_size / 8) == 0);
      const unsigned divisor = 32 / bit_size;

      for (unsigned i = 0; i < num_components; i++) {
         unsigned load_i = i + load_offset / (bit_size / 8);
         result[i] = nir_channel(b, unpacked[load_i / divisor],
                                    load_i % divisor);
      }
   } else {
      /* Otherwise, we have to break it into smaller loads */
      unsigned res_idx = 0;

      int load_offset = 0;
      while (load_offset < bytes_read) {
         const unsigned bytes_left = bytes_read - load_offset;
         unsigned load_bit_size, load_comps;
         if (align < 4) {
            load_comps = 1;
            /* Choose a byte, word, or dword */
            load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8;
         } else {
            assert(load_offset % 4 == 0);
            load_bit_size = 32;
            load_comps = DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
         }

         nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, load_offset,
                                               load_comps, load_bit_size,
                                               load_bit_size / 8);

         nir_ssa_def *unpacked = nir_bitcast_vector(b, load, bit_size);
         for (unsigned i = 0; i < unpacked->num_components; i++) {
            if (res_idx < num_components)
               result[res_idx++] = nir_channel(b, unpacked, i);
         }

         load_offset += load_comps * (load_bit_size / 8);
      }
   }

   nir_ssa_def *vec_result = nir_vec(b, result, num_components);
   nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                            nir_src_for_ssa(vec_result));
   nir_instr_remove(&intrin->instr);

   return true;
}
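
/* Lowers a store with an unsupported bit size or a non-trivial write mask.
 * Each iteration of the loop below peels the lowest contiguous run of
 * enabled bytes off of byte_mask and emits it as either a dword-aligned
 * vector store or a single byte/word/dword scattered store.
 */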
static bool
lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   assert(intrin->num_components == value->num_components);
   const unsigned bit_size = value->bit_size;
   const unsigned num_components = intrin->num_components;
   const unsigned bytes_written = num_components * (bit_size / 8);
   const unsigned align_mul = nir_intrinsic_align_mul(intrin);
   const unsigned align_offset = nir_intrinsic_align_offset(intrin);
   const unsigned align = nir_intrinsic_align(intrin);

   nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin);
   assert(writemask < (1 << num_components));

   if ((value->bit_size <= 32 && num_components == 1) ||
       (value->bit_size == 32 && writemask == (1 << num_components) - 1))
      return false;

   nir_src *offset_src = nir_get_io_offset_src(intrin);
   const bool offset_is_const = nir_src_is_const(*offset_src);
   const unsigned const_offset =
      offset_is_const ? nir_src_as_uint(*offset_src) : 0;

   assert(num_components * (bit_size / 8) <= 32);
   uint32_t byte_mask = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (writemask & (1 << i))
         byte_mask |= ((1 << (bit_size / 8)) - 1) << i * (bit_size / 8);
   }

   while (byte_mask) {
      const int start = ffs(byte_mask) - 1;
      assert(start % (bit_size / 8) == 0);

      int end;
      for (end = start + 1; end < bytes_written; end++) {
         if (!(byte_mask & (1 << end)))
            break;
      }
      /* The size of the current contiguous chunk in bytes */
      const unsigned chunk_bytes = end - start;

      const bool is_dword_aligned =
         (align_mul >= 4 && (align_offset + start) % 4 == 0) ||
         (offset_is_const && (start + const_offset) % 4 == 0);

      unsigned store_comps, store_bit_size, store_align;
      if (chunk_bytes >= 4 && is_dword_aligned) {
         store_align = MAX2(align, 4);
         store_bit_size = 32;
         store_comps = MIN2(chunk_bytes, 16) / 4;
      } else {
         store_align = align;
         store_comps = 1;
         store_bit_size = MIN2(chunk_bytes, 4) * 8;
         /* The bit size must be a power of two */
         if (store_bit_size == 24)
            store_bit_size = 16;
      }

      const unsigned store_bytes = store_comps * (store_bit_size / 8);
      assert(store_bytes % (bit_size / 8) == 0);
      const unsigned store_first_src_comp = start / (bit_size / 8);
      const unsigned store_src_comps = store_bytes / (bit_size / 8);
      assert(store_first_src_comp + store_src_comps <= num_components);

      unsigned src_swiz[4] = { 0, };
      for (unsigned i = 0; i < store_src_comps; i++)
         src_swiz[i] = store_first_src_comp + i;
      nir_ssa_def *store_value =
         nir_swizzle(b, value, src_swiz, store_src_comps, false);
      nir_ssa_def *packed = nir_bitcast_vector(b, store_value, store_bit_size);

      dup_mem_intrinsic(b, intrin, packed, start,
                        store_comps, store_bit_size, store_align);

      byte_mask &= ~(((1u << store_bytes) - 1) << start);
   }

   nir_instr_remove(&intrin->instr);

   return true;
}
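
/* Runs the two lowering helpers above over every global, SSBO, and shared
 * load/store intrinsic in the given function implementation.
 */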
static bool
lower_mem_access_bit_sizes_impl(nir_function_impl *impl)
{
   bool progress = false;

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         b.cursor = nir_after_instr(instr);

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_global:
         case nir_intrinsic_load_ssbo:
         case nir_intrinsic_load_shared:
            if (lower_mem_load_bit_size(&b, intrin))
               progress = true;
            break;

         case nir_intrinsic_store_global:
         case nir_intrinsic_store_ssbo:
         case nir_intrinsic_store_shared:
            if (lower_mem_store_bit_size(&b, intrin))
               progress = true;
            break;

         default:
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}
/**
 * This pass lowers arbitrary SSBO and shared memory load/store operations to
 * intrinsics which are natively handleable by GEN hardware.  In particular,
 * we have two general types of memory load/store messages:
 *
 *  - Untyped surface read/write:  These can load/store between one and four
 *    dword components to/from a dword-aligned offset.
 *
 *  - Byte scattered read/write:  These can load/store a single byte, word,
 *    or dword scalar to/from an unaligned byte offset.
 *
 * Neither type of message can do a write-masked store.  This pass converts
 * all nir load/store intrinsics into a series of either 8 or 32-bit
 * load/store intrinsics with a number of components that we can directly
 * handle in hardware and with a trivial write-mask.
 */
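/* As a concrete illustration (not exhaustive): a 16-bit vec2 load from a
 * constant offset becomes one dword load (one or two components) whose
 * channels are then unpacked, while an unaligned 8-bit vec3 store becomes a
 * 16-bit scattered store followed by an 8-bit scattered store.
 */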
bool
brw_nir_lower_mem_access_bit_sizes(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(func, shader) {
      if (func->impl && lower_mem_access_bit_sizes_impl(func->impl))
         progress = true;
   }

   return progress;
}