/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
/* NIR creates vectors as vecN ops, which we represent by a synthetic
 * BI_COMBINE instruction, e.g.:
 *
 *      v = combine x, y, z, w
 *
 * These combines need to be lowered by the pass in this file. Fix a given
 * source at component c.
 *
 * First suppose the source is SSA. If it is also scalar, then we may rewrite
 * the destination of the generating instruction (unique by SSA+scalar) to
 * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
 * (the original by scalar). If it is a vector, there are two cases: if
 * component c is `x`, so we are accessing v.x, and each of the succeeding
 * components y, z... up to the last component of the vector is accessed
 * sequentially, then we may perform the same rewrite. If this is not the
 * case, rewriting would require a swizzle or writemask (TODO), so we fall
 * back on a move.
 *
 * Otherwise, if the source is not SSA, we also fall back on a move. We could
 * do better. */
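
/* For intuition, a sketch of the best case (the IR spelling here is
 * schematic, not exact BIR syntax): given
 *
 *      x = fadd ...            (scalar SSA)
 *      v = combine x, y, z, w
 *
 * the pass rewrites the producer of x to write component 0 of a fresh
 * register R directly and falls back on moves for the other sources:
 *
 *      R.x = fadd ...
 *      R.y = mov y
 *      R.z = mov z
 *      R.w = mov w
 *
 * after which uses of v are rewritten to R and the combine is removed. */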

static void
bi_insert_combine_mov(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        unsigned bits = nir_alu_type_get_type_size(parent->dest_type);
        unsigned bytes = bits / 8;

        bi_instruction move = {
                .type = BI_MOV,
                .dest = R,
                .dest_type = parent->dest_type,
                .writemask = ((1 << bytes) - 1) << (bytes * comp),
                .src = { parent->src[comp] },
                .src_types = { parent->dest_type },
                .swizzle = { { parent->swizzle[comp][0] } }
        };

        bi_emit_before(ctx, parent, move);
}
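
/* For example, assuming a 32-bit dest_type (so bytes = 4), the move inserted
 * for comp = 2 gets writemask ((1 << 4) - 1) << (4 * 2) = 0xF00, i.e. it
 * writes exactly bytes 8-11 of R: the third 32-bit component. */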

/* Gets the instruction generating a given source. Combine lowering is
 * accidentally O(n^2) right now because this function is O(n) instead of
 * O(1). If this pass is slow, this cost can be avoided in favour of better
 * bookkeeping. */

static bi_instruction *
bi_get_parent(bi_context *ctx, unsigned idx, unsigned mask)
{
        bi_foreach_instr_global(ctx, ins) {
                if (ins->dest != idx) continue;

                if ((ins->writemask & mask) == mask)
                        return ins;
        }

        return NULL;
}

/* Rewrites uses of an index. Again, this could be O(n) in the program size,
 * but is currently O(nc) in the program size and combine count, so the pass
 * becomes effectively O(n^2). Better bookkeeping would bring it down to
 * linear if that's an issue. */

static void
bi_rewrite_uses(bi_context *ctx,
                unsigned old, unsigned oldc,
                unsigned new, unsigned newc)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_src(ins, s) {
                        if (ins->src[s] != old) continue;

                        for (unsigned i = 0; i < 16; ++i)
                                ins->swizzle[s][i] += (newc - oldc);

                        ins->src[s] = new;
                }
        }
}
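
/* As an illustration, bi_rewrite_uses(ctx, old, 0, R, 2) on a 32-bit value
 * repoints every source reading old at R and bumps its swizzle entries by
 * (2 - 0), so a use that read old.x afterwards reads R.z. */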

/* Shifts the writemask of an instruction by a specified byte count,
 * rotating the sources to compensate. Returns true if successful, and
 * returns false if not (nondestructive in this case). */

static bool
bi_shift_mask(bi_instruction *ins, unsigned shift)
{
        /* No-op; this also handles the funny cases */
        if (!shift)
                return true;

        unsigned sz = nir_alu_type_get_type_size(ins->dest_type);
        unsigned bytes = sz / 8;

        /* If things are misaligned, we bail. Check if shift % bytes is
         * nonzero. Note bytes is a power-of-two. */
        if (shift & (bytes - 1))
                return false;

        /* Ensure there are no funny types */
        bi_foreach_src(ins, s) {
                if (ins->src[s] && nir_alu_type_get_type_size(ins->src_types[s]) != sz)
                        return false;
        }

        /* Shift the swizzle so the old i'th component is accessed by the new
         * (i + j)'th component, where j is the component shift */
        unsigned component_shift = shift / bytes;

        /* Sanity check to avoid memory corruption */
        if (component_shift >= sizeof(ins->swizzle[0]))
                return false;

        /* Otherwise, shift is divisible by bytes, and all relevant src types
         * are the same size as the dest type. */

        ins->writemask <<= shift;

        bi_foreach_src(ins, s) {
                if (!ins->src[s]) continue;

                size_t overlap = sizeof(ins->swizzle[s]) - component_shift;
                memmove(ins->swizzle[s] + component_shift, ins->swizzle[s], overlap);
        }

        return true;
}
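
/* Worked example: for a 32-bit instruction (bytes = 4) with writemask 0xF,
 * bi_shift_mask(ins, 4) gives writemask 0xF0 and component_shift = 1; the
 * memmove rotates each swizzle up a slot, so whatever was read through
 * component i is now read through component i + 1. */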

/* Checks if we have a nicely aligned vector prefix */

static bool
bi_is_aligned_vec(bi_instruction *combine, unsigned s, bi_instruction *parent,
                unsigned *count)
{
        /* We only support prefixes */
        if (s != 0)
                return false;

        /* Is it a contiguous write? */
        unsigned writes = util_bitcount(parent->writemask);
        if (parent->writemask != ((1 << writes) - 1))
                return false;

        /* Okay - how many components? */
        unsigned bytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
        unsigned components = writes / bytes;

        /* Are we contiguous like that? */

        for (unsigned i = 0; i < components; ++i) {
                if (combine->src[i] != parent->dest)
                        return false;

                if (combine->swizzle[i][0] != i)
                        return false;
        }

        /* We're good to go */
        *count = components;
        return true;
}
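
/* E.g. a parent with a 32-bit dest_type and writemask 0xFF (writes = 8, so
 * components = 2) whose destination feeds the combine's first two sources
 * with swizzles .x and .y respectively is an aligned vec2 prefix, and *count
 * is set to 2. */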

/* Tries to lower a given source of a combine to an appropriate rewrite,
 * returning true if successful, and false with no changes otherwise. */

static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
                unsigned *vec_count)
{
        unsigned src = ins->src[s];

        /* We currently only handle SSA */

        if (!src) return false;
        if (src & (BIR_SPECIAL | BIR_IS_REG)) return false;

        /* We are SSA. Look up the generating instruction. */
        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;

        bi_instruction *parent = bi_get_parent(ctx, src,
                        0xF << (ins->swizzle[s][0] * bytes));

        if (!parent) return false;

        /* We have a parent instruction; sanity check the typesize */
        unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
        if (pbytes != bytes) return false;

        bool scalar = (parent->writemask == ((1 << bytes) - 1));
        if (!(scalar || bi_is_aligned_vec(ins, s, parent, vec_count))) return false;

        if (!bi_shift_mask(parent, bytes * s)) return false;
        bi_rewrite_uses(ctx, parent->dest, 0, R, s);
        parent->dest = R;
        return true;
}
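
/* Putting the pieces together (sketch): for v = combine a, b with 32-bit a
 * produced by a scalar instruction (writemask 0xF), lowering source s = 1
 * shifts the parent's writemask to 0xF0, rewrites uses of a.x to read R.y,
 * and repoints the parent's destination at R, so no move is needed. */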

void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
        bi_foreach_instr_in_block_safe(block, ins) {
                if (ins->type != BI_COMBINE) continue;

                /* The vector itself can't be shifted */
                assert(ins->writemask & 0x1);

                unsigned R = bi_make_temp_reg(ctx);

                bi_foreach_src(ins, s) {
                        unsigned vec_count = 0;

                        if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
                                /* Skip vectored sources */
                                if (vec_count)
                                        s += (vec_count - 1);
                        } else {
                                bi_insert_combine_mov(ctx, ins, s, R);
                        }
                }

                bi_rewrite_uses(ctx, ins->dest, 0, R, 0);
                bi_remove_instruction(ins);
        }
}