f67a414e873083309a223931ceb8897836615a2a
[mesa.git] / src / intel / compiler / brw_nir_lower_mem_access_bit_sizes.c
1 /*
2 * Copyright © 2018 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_nir.h"
25 #include "compiler/nir/nir_builder.h"
26 #include "util/u_math.h"
27 #include "util/bitscan.h"
28
29 static nir_ssa_def *
30 dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
31 nir_ssa_def *store_src, int offset,
32 unsigned num_components, unsigned bit_size,
33 unsigned align)
34 {
35 const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
36
37 nir_intrinsic_instr *dup =
38 nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
39
40 nir_src *intrin_offset_src = nir_get_io_offset_src(intrin);
41 for (unsigned i = 0; i < info->num_srcs; i++) {
42 assert(intrin->src[i].is_ssa);
43 if (i == 0 && store_src) {
44 assert(!info->has_dest);
45 assert(&intrin->src[i] != intrin_offset_src);
46 dup->src[i] = nir_src_for_ssa(store_src);
47 } else if (&intrin->src[i] == intrin_offset_src) {
48 dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa,
49 offset));
50 } else {
51 dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa);
52 }
53 }
54
55 dup->num_components = num_components;
56
57 for (unsigned i = 0; i < info->num_indices; i++)
58 dup->const_index[i] = intrin->const_index[i];
59
60 nir_intrinsic_set_align(dup, align, 0);
61
62 if (info->has_dest) {
63 assert(intrin->dest.is_ssa);
64 nir_ssa_dest_init(&dup->instr, &dup->dest,
65 num_components, bit_size,
66 intrin->dest.ssa.name);
67 } else {
68 nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1);
69 }
70
71 nir_builder_instr_insert(b, &dup->instr);
72
73 return info->has_dest ? &dup->dest.ssa : NULL;
74 }
75
76 static bool
77 lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
78 const struct gen_device_info *devinfo)
79 {
80 const bool needs_scalar =
81 intrin->intrinsic == nir_intrinsic_load_scratch;
82
83 assert(intrin->dest.is_ssa);
84 const unsigned bit_size = intrin->dest.ssa.bit_size;
85 const unsigned num_components = intrin->dest.ssa.num_components;
86 const unsigned bytes_read = num_components * (bit_size / 8);
87 const unsigned align = nir_intrinsic_align(intrin);
88
89 if (bit_size == 32 && align >= 32 &&
90 (!needs_scalar || intrin->num_components == 1))
91 return false;
92
93 nir_ssa_def *result;
94 nir_src *offset_src = nir_get_io_offset_src(intrin);
95 if (bit_size < 32 && nir_src_is_const(*offset_src)) {
96 /* The offset is constant so we can use a 32-bit load and just shift it
97 * around as needed.
98 */
99 const int load_offset = nir_src_as_uint(*offset_src) % 4;
100 assert(load_offset % (bit_size / 8) == 0);
101 const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4);
102 /* A 16-bit vec4 is a 32-bit vec2. We add an extra component in case
103 * we offset into a component with load_offset.
104 */
105 assert(load_comps32 <= 3);
106
107 nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset,
108 load_comps32, 32, 4);
109 result = nir_extract_bits(b, &load, 1, load_offset * 8,
110 num_components, bit_size);
111 } else {
112 /* Otherwise, we have to break it into smaller loads */
113 nir_ssa_def *loads[8];
114 unsigned num_loads = 0;
115 int load_offset = 0;
116 while (load_offset < bytes_read) {
117 const unsigned bytes_left = bytes_read - load_offset;
118 unsigned load_bit_size, load_comps;
119 if (align < 4) {
120 load_comps = 1;
121 /* Choose a byte, word, or dword */
122 load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8;
123 } else {
124 assert(load_offset % 4 == 0);
125 load_bit_size = 32;
126 load_comps = needs_scalar ? 1 :
127 DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
128 }
129
130 loads[num_loads++] = dup_mem_intrinsic(b, intrin, NULL, load_offset,
131 load_comps, load_bit_size,
132 align);
133
134 load_offset += load_comps * (load_bit_size / 8);
135 }
136 assert(num_loads <= ARRAY_SIZE(loads));
137 result = nir_extract_bits(b, loads, num_loads, 0,
138 num_components, bit_size);
139 }
140
141 nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
142 nir_src_for_ssa(result));
143 nir_instr_remove(&intrin->instr);
144
145 return true;
146 }
147
148 static bool
149 lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
150 const struct gen_device_info *devinfo)
151 {
152 const bool needs_scalar =
153 intrin->intrinsic == nir_intrinsic_store_scratch;
154
155 assert(intrin->src[0].is_ssa);
156 nir_ssa_def *value = intrin->src[0].ssa;
157
158 assert(intrin->num_components == value->num_components);
159 const unsigned bit_size = value->bit_size;
160 const unsigned num_components = intrin->num_components;
161 const unsigned bytes_written = num_components * (bit_size / 8);
162 const unsigned align_mul = nir_intrinsic_align_mul(intrin);
163 const unsigned align_offset = nir_intrinsic_align_offset(intrin);
164 const unsigned align = nir_intrinsic_align(intrin);
165
166 nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin);
167 assert(writemask < (1 << num_components));
168
169 if ((value->bit_size <= 32 && num_components == 1) ||
170 (value->bit_size == 32 && align >= 32 &&
171 writemask == (1 << num_components) - 1 &&
172 !needs_scalar))
173 return false;
174
175 nir_src *offset_src = nir_get_io_offset_src(intrin);
176 const bool offset_is_const = nir_src_is_const(*offset_src);
177 const unsigned const_offset =
178 offset_is_const ? nir_src_as_uint(*offset_src) : 0;
179
180 const unsigned byte_size = bit_size / 8;
181 assert(byte_size <= sizeof(uint64_t));
182
183 BITSET_DECLARE(mask, NIR_MAX_VEC_COMPONENTS * sizeof(uint64_t));
184 BITSET_ZERO(mask);
185
186 for (unsigned i = 0; i < num_components; i++) {
187 if (writemask & (1u << i))
188 BITSET_SET_RANGE(mask, i * byte_size, ((i + 1) * byte_size) - 1);
189 }
190
191 while (BITSET_FFS(mask) != 0) {
192 const int start = BITSET_FFS(mask) - 1;
193
194 int end;
195 for (end = start + 1; end < bytes_written; end++) {
196 if (!(BITSET_TEST(mask, end)))
197 break;
198 }
199 /* The size of the current contiguous chunk in bytes */
200 const unsigned chunk_bytes = end - start;
201
202 const bool is_dword_aligned =
203 (align_mul >= 4 && (align_offset + start) % 4 == 0) ||
204 (offset_is_const && (start + const_offset) % 4 == 0);
205
206 unsigned store_comps, store_bit_size, store_align;
207 if (chunk_bytes >= 4 && is_dword_aligned) {
208 store_align = MAX2(align, 4);
209 store_bit_size = 32;
210 store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4;
211 } else {
212 store_align = align;
213 store_comps = 1;
214 store_bit_size = MIN2(chunk_bytes, 4) * 8;
215 /* The bit size must be a power of two */
216 if (store_bit_size == 24)
217 store_bit_size = 16;
218 }
219 const unsigned store_bytes = store_comps * (store_bit_size / 8);
220
221 nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8,
222 store_comps, store_bit_size);
223
224 dup_mem_intrinsic(b, intrin, packed, start,
225 store_comps, store_bit_size, store_align);
226
227 BITSET_CLEAR_RANGE(mask, start, (start + store_bytes - 1));
228 }
229
230 nir_instr_remove(&intrin->instr);
231
232 return true;
233 }
234
235 static bool
236 lower_mem_access_bit_sizes_impl(nir_function_impl *impl,
237 const struct gen_device_info *devinfo)
238 {
239 bool progress = false;
240
241 nir_builder b;
242 nir_builder_init(&b, impl);
243
244 nir_foreach_block(block, impl) {
245 nir_foreach_instr_safe(instr, block) {
246 if (instr->type != nir_instr_type_intrinsic)
247 continue;
248
249 b.cursor = nir_after_instr(instr);
250
251 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
252 switch (intrin->intrinsic) {
253 case nir_intrinsic_load_global:
254 case nir_intrinsic_load_ssbo:
255 case nir_intrinsic_load_shared:
256 case nir_intrinsic_load_scratch:
257 if (lower_mem_load_bit_size(&b, intrin, devinfo))
258 progress = true;
259 break;
260
261 case nir_intrinsic_store_global:
262 case nir_intrinsic_store_ssbo:
263 case nir_intrinsic_store_shared:
264 case nir_intrinsic_store_scratch:
265 if (lower_mem_store_bit_size(&b, intrin, devinfo))
266 progress = true;
267 break;
268
269 default:
270 break;
271 }
272 }
273 }
274
275 if (progress) {
276 nir_metadata_preserve(impl, nir_metadata_block_index |
277 nir_metadata_dominance);
278 } else {
279 nir_metadata_preserve(impl, nir_metadata_all);
280 }
281
282 return progress;
283 }
284
285 /**
286 * This pass loads arbitrary SSBO and shared memory load/store operations to
287 * intrinsics which are natively handleable by GEN hardware. In particular,
288 * we have two general types of memory load/store messages:
289 *
290 * - Untyped surface read/write: These can load/store between one and four
291 * dword components to/from a dword-aligned offset.
292 *
293 * - Byte scattered read/write: These can load/store a single byte, word, or
294 * dword scalar to/from an unaligned byte offset.
295 *
296 * Neither type of message can do a write-masked store. This pass converts
297 * all nir load/store intrinsics into a series of either 8 or 32-bit
298 * load/store intrinsics with a number of components that we can directly
299 * handle in hardware and with a trivial write-mask.
300 *
301 * For scratch access, additional consideration has to be made due to the way
302 * that we swizzle the memory addresses to achieve decent cache locality. In
303 * particular, even though untyped surface read/write messages exist and work,
304 * we can't use them to load multiple components in a single SEND. For more
305 * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr.
306 */
307 bool
308 brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
309 const struct gen_device_info *devinfo)
310 {
311 bool progress = false;
312
313 nir_foreach_function(func, shader) {
314 if (func->impl && lower_mem_access_bit_sizes_impl(func->impl, devinfo))
315 progress = true;
316 }
317
318 return progress;
319 }