/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
25 #include "compiler/nir/nir_builder.h"
26 #include "util/u_math.h"
27 #include "util/bitscan.h"
30 dup_mem_intrinsic(nir_builder
*b
, nir_intrinsic_instr
*intrin
,
31 nir_ssa_def
*store_src
, int offset
,
32 unsigned num_components
, unsigned bit_size
,
35 const nir_intrinsic_info
*info
= &nir_intrinsic_infos
[intrin
->intrinsic
];
37 nir_intrinsic_instr
*dup
=
38 nir_intrinsic_instr_create(b
->shader
, intrin
->intrinsic
);
40 nir_src
*intrin_offset_src
= nir_get_io_offset_src(intrin
);
41 for (unsigned i
= 0; i
< info
->num_srcs
; i
++) {
42 assert(intrin
->src
[i
].is_ssa
);
43 if (i
== 0 && store_src
) {
44 assert(!info
->has_dest
);
45 assert(&intrin
->src
[i
] != intrin_offset_src
);
46 dup
->src
[i
] = nir_src_for_ssa(store_src
);
47 } else if (&intrin
->src
[i
] == intrin_offset_src
) {
48 dup
->src
[i
] = nir_src_for_ssa(nir_iadd_imm(b
, intrin
->src
[i
].ssa
,
51 dup
->src
[i
] = nir_src_for_ssa(intrin
->src
[i
].ssa
);
55 dup
->num_components
= num_components
;
57 for (unsigned i
= 0; i
< info
->num_indices
; i
++)
58 dup
->const_index
[i
] = intrin
->const_index
[i
];
60 nir_intrinsic_set_align(dup
, align
, 0);
63 assert(intrin
->dest
.is_ssa
);
64 nir_ssa_dest_init(&dup
->instr
, &dup
->dest
,
65 num_components
, bit_size
,
66 intrin
->dest
.ssa
.name
);
68 nir_intrinsic_set_write_mask(dup
, (1 << num_components
) - 1);
71 nir_builder_instr_insert(b
, &dup
->instr
);
73 return info
->has_dest
? &dup
->dest
.ssa
: NULL
;
77 lower_mem_load_bit_size(nir_builder
*b
, nir_intrinsic_instr
*intrin
)
79 assert(intrin
->dest
.is_ssa
);
80 if (intrin
->dest
.ssa
.bit_size
== 32)
83 const unsigned bit_size
= intrin
->dest
.ssa
.bit_size
;
84 const unsigned num_components
= intrin
->dest
.ssa
.num_components
;
85 const unsigned bytes_read
= num_components
* (bit_size
/ 8);
86 const unsigned align
= nir_intrinsic_align(intrin
);
88 nir_ssa_def
*result
[NIR_MAX_VEC_COMPONENTS
] = { NULL
, };
90 nir_src
*offset_src
= nir_get_io_offset_src(intrin
);
91 if (bit_size
< 32 && nir_src_is_const(*offset_src
)) {
92 /* The offset is constant so we can use a 32-bit load and just shift it
95 const int load_offset
= nir_src_as_uint(*offset_src
) % 4;
96 assert(load_offset
% (bit_size
/ 8) == 0);
97 const unsigned load_comps32
= DIV_ROUND_UP(bytes_read
+ load_offset
, 4);
98 /* A 16-bit vec4 is a 32-bit vec2. We add an extra component in case
99 * we offset into a component with load_offset.
101 assert(load_comps32
<= 3);
103 nir_ssa_def
*load
= dup_mem_intrinsic(b
, intrin
, NULL
, -load_offset
,
104 load_comps32
, 32, 4);
105 nir_ssa_def
*unpacked
[3];
106 for (unsigned i
= 0; i
< load_comps32
; i
++)
107 unpacked
[i
] = nir_unpack_bits(b
, nir_channel(b
, load
, i
), bit_size
);
109 assert(load_offset
% (bit_size
/ 8) == 0);
110 const unsigned divisor
= 32 / bit_size
;
112 for (unsigned i
= 0; i
< num_components
; i
++) {
113 unsigned load_i
= i
+ load_offset
/ (bit_size
/ 8);
114 result
[i
] = nir_channel(b
, unpacked
[load_i
/ divisor
],
118 /* Otherwise, we have to break it into smaller loads */
119 unsigned res_idx
= 0;
121 while (load_offset
< bytes_read
) {
122 const unsigned bytes_left
= bytes_read
- load_offset
;
123 unsigned load_bit_size
, load_comps
;
126 /* Choose a byte, word, or dword */
127 load_bit_size
= util_next_power_of_two(MIN2(bytes_left
, 4)) * 8;
129 assert(load_offset
% 4 == 0);
131 load_comps
= DIV_ROUND_UP(MIN2(bytes_left
, 16), 4);
134 nir_ssa_def
*load
= dup_mem_intrinsic(b
, intrin
, NULL
, load_offset
,
135 load_comps
, load_bit_size
,
138 nir_ssa_def
*unpacked
= nir_bitcast_vector(b
, load
, bit_size
);
139 for (unsigned i
= 0; i
< unpacked
->num_components
; i
++) {
140 if (res_idx
< num_components
)
141 result
[res_idx
++] = nir_channel(b
, unpacked
, i
);
144 load_offset
+= load_comps
* (load_bit_size
/ 8);
148 nir_ssa_def
*vec_result
= nir_vec(b
, result
, num_components
);
149 nir_ssa_def_rewrite_uses(&intrin
->dest
.ssa
,
150 nir_src_for_ssa(vec_result
));
151 nir_instr_remove(&intrin
->instr
);
157 lower_mem_store_bit_size(nir_builder
*b
, nir_intrinsic_instr
*intrin
)
159 assert(intrin
->src
[0].is_ssa
);
160 nir_ssa_def
*value
= intrin
->src
[0].ssa
;
162 assert(intrin
->num_components
== value
->num_components
);
163 const unsigned bit_size
= value
->bit_size
;
164 const unsigned num_components
= intrin
->num_components
;
165 const unsigned bytes_written
= num_components
* (bit_size
/ 8);
166 const unsigned align_mul
= nir_intrinsic_align_mul(intrin
);
167 const unsigned align_offset
= nir_intrinsic_align_offset(intrin
);
168 const unsigned align
= nir_intrinsic_align(intrin
);
170 nir_component_mask_t writemask
= nir_intrinsic_write_mask(intrin
);
171 assert(writemask
< (1 << num_components
));
173 if ((value
->bit_size
<= 32 && num_components
== 1) ||
174 (value
->bit_size
== 32 && writemask
== (1 << num_components
) - 1))
177 nir_src
*offset_src
= nir_get_io_offset_src(intrin
);
178 const bool offset_is_const
= nir_src_is_const(*offset_src
);
179 const unsigned const_offset
=
180 offset_is_const
? nir_src_as_uint(*offset_src
) : 0;
182 const unsigned byte_size
= bit_size
/ 8;
183 assert(byte_size
<= sizeof(uint64_t));
185 BITSET_DECLARE(mask
, NIR_MAX_VEC_COMPONENTS
* sizeof(uint64_t));
188 for (unsigned i
= 0; i
< num_components
; i
++) {
189 if (writemask
& (1u << i
))
190 BITSET_SET_RANGE(mask
, i
* byte_size
, ((i
+ 1) * byte_size
) - 1);
193 while (BITSET_FFS(mask
) != 0) {
194 const int start
= BITSET_FFS(mask
) - 1;
195 assert(start
% byte_size
== 0);
198 for (end
= start
+ 1; end
< bytes_written
; end
++) {
199 if (!(BITSET_TEST(mask
, end
)))
202 /* The size of the current contiguous chunk in bytes */
203 const unsigned chunk_bytes
= end
- start
;
205 const bool is_dword_aligned
=
206 (align_mul
>= 4 && (align_offset
+ start
) % 4 == 0) ||
207 (offset_is_const
&& (start
+ const_offset
) % 4 == 0);
209 unsigned store_comps
, store_bit_size
, store_align
;
210 if (chunk_bytes
>= 4 && is_dword_aligned
) {
211 store_align
= MAX2(align
, 4);
213 store_comps
= MIN2(chunk_bytes
, 16) / 4;
217 store_bit_size
= MIN2(chunk_bytes
, 4) * 8;
218 /* The bit size must be a power of two */
219 if (store_bit_size
== 24)
223 const unsigned store_bytes
= store_comps
* (store_bit_size
/ 8);
224 assert(store_bytes
% byte_size
== 0);
225 const unsigned store_first_src_comp
= start
/ byte_size
;
226 const unsigned store_src_comps
= store_bytes
/ byte_size
;
227 assert(store_first_src_comp
+ store_src_comps
<= num_components
);
229 unsigned src_swiz
[4] = { 0, };
230 for (unsigned i
= 0; i
< store_src_comps
; i
++)
231 src_swiz
[i
] = store_first_src_comp
+ i
;
232 nir_ssa_def
*store_value
=
233 nir_swizzle(b
, value
, src_swiz
, store_src_comps
);
234 nir_ssa_def
*packed
= nir_bitcast_vector(b
, store_value
, store_bit_size
);
236 dup_mem_intrinsic(b
, intrin
, packed
, start
,
237 store_comps
, store_bit_size
, store_align
);
239 BITSET_CLEAR_RANGE(mask
, start
, (start
+ store_bytes
- 1));
242 nir_instr_remove(&intrin
->instr
);
248 lower_mem_access_bit_sizes_impl(nir_function_impl
*impl
)
250 bool progress
= false;
253 nir_builder_init(&b
, impl
);
255 nir_foreach_block(block
, impl
) {
256 nir_foreach_instr_safe(instr
, block
) {
257 if (instr
->type
!= nir_instr_type_intrinsic
)
260 b
.cursor
= nir_after_instr(instr
);
262 nir_intrinsic_instr
*intrin
= nir_instr_as_intrinsic(instr
);
263 switch (intrin
->intrinsic
) {
264 case nir_intrinsic_load_global
:
265 case nir_intrinsic_load_ssbo
:
266 case nir_intrinsic_load_shared
:
267 if (lower_mem_load_bit_size(&b
, intrin
))
271 case nir_intrinsic_store_global
:
272 case nir_intrinsic_store_ssbo
:
273 case nir_intrinsic_store_shared
:
274 if (lower_mem_store_bit_size(&b
, intrin
))
285 nir_metadata_preserve(impl
, nir_metadata_block_index
|
286 nir_metadata_dominance
);
/**
 * This pass lowers arbitrary SSBO and shared memory load/store operations to
 * intrinsics which are natively handleable by GEN hardware.  In particular,
 * we have two general types of memory load/store messages:
 *
 *  - Untyped surface read/write:  These can load/store between one and four
 *    dword components to/from a dword-aligned offset.
 *
 *  - Byte scattered read/write:  These can load/store a single byte, word, or
 *    dword scalar to/from an unaligned byte offset.
 *
 * Neither type of message can do a write-masked store.  This pass converts
 * all nir load/store intrinsics into a series of either 8 or 32-bit
 * load/store intrinsics with a number of components that we can directly
 * handle in hardware and with a trivial write-mask.
 */
309 brw_nir_lower_mem_access_bit_sizes(nir_shader
*shader
)
311 bool progress
= false;
313 nir_foreach_function(func
, shader
) {
314 if (func
->impl
&& lower_mem_access_bit_sizes_impl(func
->impl
))