/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "util/bitscan.h"
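
/* Clone the given load/store intrinsic with its offset source advanced by
 * offset bytes and with the requested component count, bit size, and
 * alignment.  For stores, store_src replaces the value source.  Returns the
 * new load's SSA destination, or NULL for stores.
 */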
static nir_ssa_def *
dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                  nir_ssa_def *store_src, int offset,
                  unsigned num_components, unsigned bit_size,
                  unsigned align)
{
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];

   nir_intrinsic_instr *dup =
      nir_intrinsic_instr_create(b->shader, intrin->intrinsic);

   nir_src *intrin_offset_src = nir_get_io_offset_src(intrin);
   for (unsigned i = 0; i < info->num_srcs; i++) {
      assert(intrin->src[i].is_ssa);
      if (i == 0 && store_src) {
         assert(!info->has_dest);
         assert(&intrin->src[i] != intrin_offset_src);
         dup->src[i] = nir_src_for_ssa(store_src);
      } else if (&intrin->src[i] == intrin_offset_src) {
         dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa,
                                                    offset));
      } else {
         dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa);
      }
   }

   dup->num_components = num_components;

   for (unsigned i = 0; i < info->num_indices; i++)
      dup->const_index[i] = intrin->const_index[i];

   nir_intrinsic_set_align(dup, align, 0);

   if (info->has_dest) {
      assert(intrin->dest.is_ssa);
      nir_ssa_dest_init(&dup->instr, &dup->dest,
                        num_components, bit_size,
                        intrin->dest.ssa.name);
   } else {
      nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1);
   }

   nir_builder_instr_insert(b, &dup->instr);

   return info->has_dest ? &dup->dest.ssa : NULL;
}

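/* Lower a single load whose bit size or component count the hardware
 * messages cannot handle directly.  Returns true if the load was replaced.
 */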
static bool
lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
                        const struct gen_device_info *devinfo)
{
   const bool needs_scalar =
      intrin->intrinsic == nir_intrinsic_load_scratch;

   assert(intrin->dest.is_ssa);
   if (intrin->dest.ssa.bit_size == 32 &&
       (!needs_scalar || intrin->num_components == 1))
      return false;

   const unsigned bit_size = intrin->dest.ssa.bit_size;
   const unsigned num_components = intrin->dest.ssa.num_components;
   const unsigned bytes_read = num_components * (bit_size / 8);
   const unsigned align = nir_intrinsic_align(intrin);

   nir_ssa_def *result;
   nir_src *offset_src = nir_get_io_offset_src(intrin);
   if (bit_size < 32 && nir_src_is_const(*offset_src)) {
      /* The offset is constant so we can use a 32-bit load and just shift it
       * around as needed.
       */
      const int load_offset = nir_src_as_uint(*offset_src) % 4;
      assert(load_offset % (bit_size / 8) == 0);
      const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4);
      /* A 16-bit vec4 is a 32-bit vec2.  We add an extra component in case
       * we offset into a component with load_offset.
       */
      assert(load_comps32 <= 3);
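
      /* Illustrative example (not in the original source): a 16-bit scalar
       * load at constant offset 6 has load_offset == 2, so we emit a single
       * 32-bit load at offset 4 below and extract bits [16, 32) from it.
       */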
      nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset,
                                            load_comps32, 32, 4);
      result = nir_extract_bits(b, &load, 1, load_offset * 8,
                                num_components, bit_size);
   } else {
      /* Otherwise, we have to break it into smaller loads */
      nir_ssa_def *loads[8];
      unsigned num_loads = 0;
      int load_offset = 0;
      while (load_offset < bytes_read) {
         const unsigned bytes_left = bytes_read - load_offset;
         unsigned load_bit_size, load_comps;
         if (align < 4) {
            load_comps = 1;
            /* Choose a byte, word, or dword */
            load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8;
         } else {
            assert(load_offset % 4 == 0);
            load_bit_size = 32;
            load_comps = needs_scalar ? 1 :
                         DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
         }

         loads[num_loads++] = dup_mem_intrinsic(b, intrin, NULL, load_offset,
                                                load_comps, load_bit_size,
                                                align);

         load_offset += load_comps * (load_bit_size / 8);
      }

      assert(num_loads <= ARRAY_SIZE(loads));
      result = nir_extract_bits(b, loads, num_loads, 0,
                                num_components, bit_size);
   }

   nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                            nir_src_for_ssa(result));
   nir_instr_remove(&intrin->instr);

   return true;
}

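/* Lower a single store that the hardware messages cannot handle directly:
 * write-masked, sub-dword, or scratch (which must be scalar).  Returns true
 * if the store was replaced.
 */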
static bool
lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
                         const struct gen_device_info *devinfo)
{
   const bool needs_scalar =
      intrin->intrinsic == nir_intrinsic_store_scratch;

   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   assert(intrin->num_components == value->num_components);
   const unsigned bit_size = value->bit_size;
   const unsigned num_components = intrin->num_components;
   const unsigned bytes_written = num_components * (bit_size / 8);
   const unsigned align_mul = nir_intrinsic_align_mul(intrin);
   const unsigned align_offset = nir_intrinsic_align_offset(intrin);
   const unsigned align = nir_intrinsic_align(intrin);

   nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin);
   assert(writemask < (1 << num_components));

   if ((value->bit_size <= 32 && num_components == 1) ||
       (value->bit_size == 32 &&
        writemask == (1 << num_components) - 1 &&
        !needs_scalar))
      return false;

   nir_src *offset_src = nir_get_io_offset_src(intrin);
   const bool offset_is_const = nir_src_is_const(*offset_src);
   const unsigned const_offset =
      offset_is_const ? nir_src_as_uint(*offset_src) : 0;

   const unsigned byte_size = bit_size / 8;
   assert(byte_size <= sizeof(uint64_t));

   BITSET_DECLARE(mask, NIR_MAX_VEC_COMPONENTS * sizeof(uint64_t));
   BITSET_ZERO(mask);

   for (unsigned i = 0; i < num_components; i++) {
      if (writemask & (1u << i))
         BITSET_SET_RANGE(mask, i * byte_size, ((i + 1) * byte_size) - 1);
   }
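
   /* Peel contiguous chunks of written bytes off the mask, storing each
    * chunk with the widest message the alignment permits.
    */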
   while (BITSET_FFS(mask) != 0) {
      const int start = BITSET_FFS(mask) - 1;

      int end;
      for (end = start + 1; end < bytes_written; end++) {
         if (!(BITSET_TEST(mask, end)))
            break;
      }
      /* The size of the current contiguous chunk in bytes */
      const unsigned chunk_bytes = end - start;

      const bool is_dword_aligned =
         (align_mul >= 4 && (align_offset + start) % 4 == 0) ||
         (offset_is_const && (start + const_offset) % 4 == 0);

      unsigned store_comps, store_bit_size, store_align;
      if (chunk_bytes >= 4 && is_dword_aligned) {
         store_align = MAX2(align, 4);
         store_bit_size = 32;
         store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4;
      } else {
         store_align = align;
         store_comps = 1;
         store_bit_size = MIN2(chunk_bytes, 4) * 8;
         /* The bit size must be a power of two */
         if (store_bit_size == 24)
            store_bit_size = 16;
      }
      const unsigned store_bytes = store_comps * (store_bit_size / 8);

      nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8,
                                             store_comps, store_bit_size);

      dup_mem_intrinsic(b, intrin, packed, start,
                        store_comps, store_bit_size, store_align);

      BITSET_CLEAR_RANGE(mask, start, (start + store_bytes - 1));
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

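/* Per-function driver: walk every instruction and dispatch the load/store
 * intrinsics we know how to lower to the helpers above.
 */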
static bool
lower_mem_access_bit_sizes_impl(nir_function_impl *impl,
                                const struct gen_device_info *devinfo)
{
   bool progress = false;

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         b.cursor = nir_after_instr(instr);

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_global:
         case nir_intrinsic_load_ssbo:
         case nir_intrinsic_load_shared:
         case nir_intrinsic_load_scratch:
            if (lower_mem_load_bit_size(&b, intrin, devinfo))
               progress = true;
            break;

         case nir_intrinsic_store_global:
         case nir_intrinsic_store_ssbo:
         case nir_intrinsic_store_shared:
         case nir_intrinsic_store_scratch:
            if (lower_mem_store_bit_size(&b, intrin, devinfo))
               progress = true;
            break;

         default:
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   }

   return progress;
}

/**
 * This pass lowers arbitrary SSBO and shared memory load/store operations to
 * intrinsics which are natively handleable by GEN hardware.  In particular,
 * we have two general types of memory load/store messages:
 *
 *  - Untyped surface read/write:  These can load/store between one and four
 *    dword components to/from a dword-aligned offset.
 *
 *  - Byte scattered read/write:  These can load/store a single byte, word,
 *    or dword scalar to/from an unaligned byte offset.
 *
 * Neither type of message can do a write-masked store.  This pass converts
 * all nir load/store intrinsics into a series of 8, 16, or 32-bit
 * load/store intrinsics with a number of components that we can directly
 * handle in hardware and with a trivial write-mask.
 *
 * For scratch access, additional consideration has to be made due to the way
 * that we swizzle the memory addresses to achieve decent cache locality.  In
 * particular, even though untyped surface read/write messages exist and work,
 * we can't use them to load multiple components in a single SEND.  For more
 * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr.
 */
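/* As an illustrative example (not in the original source): assuming a
 * dword-aligned, non-scratch store of the low three components of a 16-bit
 * vec4 (writemask 0x7), the pass emits one 32-bit store covering bytes 0-3
 * followed by one 16-bit byte-scattered store covering bytes 4-5.
 */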
bool
brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
                                   const struct gen_device_info *devinfo)
{
   bool progress = false;

   nir_foreach_function(func, shader) {
      if (func->impl && lower_mem_access_bit_sizes_impl(func->impl, devinfo))
         progress = true;
   }

   return progress;
}