/*
 * Copyright © 2020 Google LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* Lowers nir_intrinsic_load_ubo() to nir_intrinsic_load_ubo_vec4() taking an
 * offset in vec4 units. This is a fairly common mode of UBO addressing for
 * hardware to have, and it gives NIR a chance to optimize the addressing math
 * and CSE the loads.
 *
 * We assume that the UBO loads do not cross a vec4 boundary. This is true
 * for:
 *
 * - std140 (GLSL 1.40, GLSL ES)
 * - Vulkan "Extended Layout" (the baseline for UBOs)
 *
 * but not:
 *
 * - GLSL 4.30's new packed mode (enabled by PIPE_CAP_LOAD_CONSTBUF) where
 *   vec3 arrays are packed tightly.
 *
 * - Vulkan's scalarBlockLayout optional feature:
 *
 *   "A member is defined to improperly straddle if either of the following are
 *    true:
 *
 *    • It is a vector with total size less than or equal to 16 bytes, and has
 *      Offset decorations placing its first byte at F and its last byte at L
 *      where floor(F / 16) != floor(L / 16).
 *    • It is a vector with total size greater than 16 bytes and has its Offset
 *      decorations placing its first byte at a non-integer multiple of 16.
 *
 *    [...]
 *
 *    Unless the scalarBlockLayout feature is enabled on the device:
 *
 *    • Vectors must not improperly straddle, as defined above."
 */
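/* As an illustrative sketch (exact SSA names and printed form vary by driver
 * and NIR version): a 32-bit vec2 load_ubo of block ssa_0 at byte offset
 * ssa_1, with align_mul=16 and align_offset=8, becomes roughly
 *
 *    load_ubo_vec4 (ssa_0, ushr(ssa_1, 4)) (component=2), num_components=2
 *
 * i.e. the .zw channels of the vec4 slot containing the value.  Loads whose
 * position within the vec4 is not known statically instead load the whole
 * vec4 and select the needed channels with bcsels / vector extracts below.
 */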

#include "nir.h"
#include "nir_builder.h"

static bool
nir_lower_ubo_vec4_filter(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo;
}

static nir_ssa_def *
nir_lower_ubo_vec4_lower(nir_builder *b, nir_instr *instr, void *data)
{
   b->cursor = nir_before_instr(instr);

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_ssa_def *byte_offset = nir_ssa_for_src(b, intr->src[1], 1);

   nir_intrinsic_instr *load =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_vec4);
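   /* src[0] is the UBO block index, reused as-is.  src[1] of load_ubo_vec4 is
    * the offset in vec4 (16-byte) units, so shift the byte offset right by 4.
    */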
   nir_src_copy(&load->src[0], &intr->src[0], &load->instr);
   load->src[1] = nir_src_for_ssa(nir_ushr_imm(b, byte_offset, 4));

   unsigned align_mul = nir_intrinsic_align_mul(intr);
   unsigned align_offset = nir_intrinsic_align_offset(intr);

   int chan_size_bytes = intr->dest.ssa.bit_size / 8;
   int chans_per_vec4 = 16 / chan_size_bytes;
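   /* For example, 32-bit channels give chan_size_bytes = 4 and
    * chans_per_vec4 = 4, while 16-bit channels give 2 and 8.
    */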

   /* We don't care if someone figured out that things are aligned beyond
    * vec4.
    */
   align_mul = MIN2(align_mul, 16);
   align_offset &= 15;
   assert(align_offset % chan_size_bytes == 0);

   /* We assume that loads don't cross vec4 boundaries, just that we need
    * to extract from within the vec4 when we don't have a good alignment.
    */
   if (intr->num_components == chans_per_vec4) {
      align_mul = 16;
      align_offset = 0;
   }

   unsigned num_components = intr->num_components;
   bool aligned_mul = align_mul % 16 == 0;
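   /* If the position of the load within its vec4 slot isn't known at compile
    * time, load the entire vec4 and select the needed channels below.
    */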
   if (!aligned_mul)
      num_components = chans_per_vec4;

   nir_ssa_dest_init(&load->instr, &load->dest,
                     num_components, intr->dest.ssa.bit_size,
                     intr->dest.ssa.name);
   load->num_components = num_components;
   nir_builder_instr_insert(b, &load->instr);

   nir_ssa_def *result = &load->dest.ssa;

   int align_chan_offset = align_offset / chan_size_bytes;
   if (aligned_mul) {
      /* For an aligned load, just ask the backend to load from the known
       * offset's component.
       */
      nir_intrinsic_set_component(load, align_chan_offset);
   } else {
      if (align_mul == 8) {
         /* Special case: Loading small vectors from offset % 8 == 0 can be
          * done with just one bcsel.
          */
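         /* For example, a 32-bit vec2 known to start at an 8-byte-aligned
          * offset covers either .xy or .zw of its vec4 slot; bit 3 of the
          * byte offset selects between the two halves.
          */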
         nir_component_mask_t low_channels =
            BITSET_MASK(intr->num_components) << (align_chan_offset);
         nir_component_mask_t high_channels =
            low_channels << (8 / chan_size_bytes);
         result = nir_bcsel(b,
                            nir_i2b(b, nir_iand_imm(b, byte_offset, 8)),
                            nir_channels(b, result, high_channels),
                            nir_channels(b, result, low_channels));
      } else {
         /* General fallback case: Per-result-channel bcsel-based extraction
          * from the load.
          */
         assert(align_mul == 4);
         assert(align_chan_offset == 0);

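         /* The starting channel is (byte_offset / chan_size) % chans_per_vec4,
          * e.g. (byte_offset / 4) & 3 for 32-bit channels.
          */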
         nir_ssa_def *component =
            nir_iand_imm(b,
                         nir_udiv_imm(b, byte_offset, chan_size_bytes),
                         chans_per_vec4 - 1);

         nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
         for (unsigned i = 0; i < intr->num_components; i++) {
            channels[i] = nir_vector_extract(b, result,
                                             nir_iadd_imm(b, component, i));
         }

         result = nir_vec(b, channels, intr->num_components);
      }
   }

   return result;
}

bool
nir_lower_ubo_vec4(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        nir_lower_ubo_vec4_filter,
                                        nir_lower_ubo_vec4_lower,
                                        NULL);
}
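
/* Usage sketch (not part of this file): a driver that wants vec4-unit UBO
 * addressing would typically run this pass from its NIR lowering loop, once
 * UBO loads carry explicit byte offsets, e.g.:
 *
 *    NIR_PASS(progress, shader, nir_lower_ubo_vec4);
 *
 * and then consume nir_intrinsic_load_ubo_vec4 in its backend.
 */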