/*
 * Copyright © 2020 Google LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/* Lowers nir_intrinsic_load_ubo() to nir_intrinsic_load_ubo_vec4() taking an
 * offset in vec4 units.  This is a fairly common mode of UBO addressing for
 * hardware to have, and it gives NIR a chance to optimize the addressing math
 * and CSE the loads.
 *
 * We assume that the UBO loads do not cross a vec4 boundary.  This is true
 * for:
 *
 * - std140 (GLSL 1.40, GLSL ES)
 * - Vulkan "Extended Layout" (the baseline for UBOs)
 *
 * but not:
 *
 * - GLSL 4.30's new packed mode (enabled by PIPE_CAP_LOAD_CONSTBUF) where
 *   vec3 arrays are packed tightly.
 *
 * - Vulkan's scalarBlockLayout optional feature:
 *
 *   "A member is defined to improperly straddle if either of the following are
 *    true:
 *
 *    • It is a vector with total size less than or equal to 16 bytes, and has
 *      Offset decorations placing its first byte at F and its last byte at L
 *      where floor(F / 16) != floor(L / 16).
 *    • It is a vector with total size greater than 16 bytes and has its Offset
 *      decorations placing its first byte at a non-integer multiple of 16.
 *
 *    Unless the scalarBlockLayout feature is enabled on the device:
 *
 *    • Vectors must not improperly straddle, as defined above."
 */
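/* Worked example: a 32-bit vec2 load at constant byte offset 20 has
 * align_mul == 16 and align_offset == 4, so it becomes a load_ubo_vec4 of
 * vec4 index 1 (20 >> 4) reading components 1..2 (byte 4 of the vec4, at
 * 4 bytes per channel).
 */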
#include "nir.h"
#include "nir_builder.h"
static bool
nir_lower_ubo_vec4_filter(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo;
}
static nir_ssa_def *
nir_lower_ubo_vec4_lower(nir_builder *b, nir_instr *instr, void *data)
{
   b->cursor = nir_before_instr(instr);

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_ssa_def *byte_offset = nir_ssa_for_src(b, intr->src[1], 1);

   nir_intrinsic_instr *load =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_vec4);
   nir_src_copy(&load->src[0], &intr->src[0], &load->instr);
   load->src[1] = nir_src_for_ssa(nir_ushr_imm(b, byte_offset, 4));

   unsigned align_mul = nir_intrinsic_align_mul(intr);
   unsigned align_offset = nir_intrinsic_align_offset(intr);

   int chan_size_bytes = intr->dest.ssa.bit_size / 8;
   int chans_per_vec4 = 16 / chan_size_bytes;
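   /* For reference: 32-bit channels give chans_per_vec4 == 4 and 16-bit
    * channels give 8.  The vec4 index above (byte_offset >> 4) is just
    * byte_offset / 16.
    */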
   /* We don't care if someone figured out that things are aligned beyond
    * vec4 alignment.
    */
   align_mul = MIN2(align_mul, 16);

   assert(align_offset % chan_size_bytes == 0);
   /* We assume that loads don't cross vec4 boundaries, just that we need
    * to extract from within the vec4 when we don't have a good alignment.
    */
   if (intr->num_components == chans_per_vec4) {
      align_mul = 16;
      align_offset = 0;
   }

   unsigned num_components = intr->num_components;
   bool aligned_mul = align_mul % 16 == 0;
   if (!aligned_mul)
      num_components = chans_per_vec4;
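   /* When the alignment doesn't pin the load to a known position within its
    * vec4, load the whole vec4 and extract the wanted channels below.
    */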
   nir_ssa_dest_init(&load->instr, &load->dest,
                     num_components, intr->dest.ssa.bit_size,
                     intr->dest.ssa.name);
   load->num_components = num_components;
   nir_builder_instr_insert(b, &load->instr);

   nir_ssa_def *result = &load->dest.ssa;

   int align_chan_offset = align_offset / chan_size_bytes;
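   /* Three extraction strategies follow: a known component for a fully
    * aligned load, one bcsel between the low/high halves for align_mul == 8,
    * and per-channel indexed extraction for the align_mul == 4 fallback.
    */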
   if (aligned_mul) {
      /* For an aligned load, just ask the backend to load from the known
       * offset's component.
       */
      nir_intrinsic_set_component(load, align_chan_offset);
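      /* e.g. a 32-bit vec2 with align_offset == 8 is always read from
       * components 2..3 of its vec4.
       */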
   } else {
      if (align_mul == 8) {
         /* Special case: Loading small vectors from offset % 8 == 0 can be
          * done with just one bcsel.
          */
         nir_component_mask_t low_channels =
            BITSET_MASK(intr->num_components) << (align_chan_offset);
         nir_component_mask_t high_channels =
            low_channels << (8 / chan_size_bytes);
         result = nir_bcsel(b,
                            nir_i2b(b, nir_iand_imm(b, byte_offset, 8)),
                            nir_channels(b, result, high_channels),
                            nir_channels(b, result, low_channels));
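         /* e.g. a 32-bit vec2 with align_chan_offset == 0 gets
          * low_channels == 0x3 (xy) and high_channels == 0xc (zw); bit 3 of
          * byte_offset picks between the two halves.
          */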
      } else {
         /* General fallback case: Per-result-channel bcsel-based extraction
          * from the vec4.
          */
         assert(align_mul == 4);
         assert(align_chan_offset == 0);

         nir_ssa_def *component =
            nir_iand(b,
                     nir_udiv_imm(b, byte_offset, chan_size_bytes),
                     nir_imm_int(b, chans_per_vec4 - 1));
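         /* component is the index of the load's first channel within the
          * vec4: byte_offset / chan_size_bytes wrapped with an AND, since
          * chans_per_vec4 is a power of two.
          */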
         nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
         for (unsigned i = 0; i < intr->num_components; i++) {
            channels[i] = nir_vector_extract(b, result,
                                             nir_iadd_imm(b, component, i));
         }

         result = nir_vec(b, channels, intr->num_components);
      }
   }

   return result;
}
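/* Entry point: nir_shader_lower_instructions() runs the filter over every
 * instruction, rewrites all uses of each matched load_ubo to the SSA def
 * returned by the lowering callback, and reports whether progress was made.
 */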
bool
nir_lower_ubo_vec4(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        nir_lower_ubo_vec4_filter,
                                        nir_lower_ubo_vec4_lower,
                                        NULL);
}