2 * Copyright (C) 2020 Collabora, Ltd.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* NIR creates vectors as vecN ops, which we represent by a synthetic
27 * BI_COMBINE instruction, e.g.:
29 * v = combine x, y, z, w
31 * These combines need to be lowered by the pass in this file. Fix a given
32 * source at component c.
34 * First suppose the source is SSA. If it is also scalar, then we may rewrite
35 * the destination of the generating instruction (unique by SSA+scalar) to
36 * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
37 * (the original by scalar). If it is vector, there are two cases. If the
38 * component c is `x`, we are accessing v.x, and each of the succeeding
39 * components y, z... up to the last component of the vector are accessed
40 * sequentially, then we may perform the same rewrite. If this is not the case,
41 * rewriting would require more complex vector features, so we fall back on a
44 * Otherwise, if the source is not SSA, we also fall back on a move. We could
49 bi_combine_mov32(bi_context
*ctx
, bi_instruction
*parent
, unsigned comp
, unsigned R
)
51 bi_instruction move
= {
54 .dest_type
= nir_type_uint32
,
56 .src
= { parent
->src
[comp
] },
57 .src_types
= { nir_type_uint32
},
58 .swizzle
= { { parent
->swizzle
[comp
][0] } }
61 bi_emit_before(ctx
, parent
, move
);
65 bi_combine_sel16(bi_context
*ctx
, bi_instruction
*parent
, unsigned comp
, unsigned R
)
67 bi_instruction sel
= {
70 .dest_type
= nir_type_uint32
,
71 .dest_offset
= comp
>> 1,
72 .src
= { parent
->src
[comp
], parent
->src
[comp
+ 1] },
73 .src_types
= { nir_type_uint16
, nir_type_uint16
},
75 { parent
->swizzle
[comp
][0] },
76 { parent
->swizzle
[comp
+ 1][0] },
80 /* In case we have a combine from a vec3 */
82 sel
.src
[1] = BIR_INDEX_ZERO
;
84 bi_emit_before(ctx
, parent
, sel
);
87 /* Gets the instruction generating a given source. Combine lowering is
88 * accidentally O(n^2) right now because this function is O(n) instead of O(1).
89 * If this pass is slow, this cost can be avoided in favour of better
93 static bi_instruction
*
94 bi_get_parent(bi_context
*ctx
, unsigned idx
)
96 bi_foreach_instr_global(ctx
, ins
) {
105 /* Rewrites uses of an index. Again, this could be O(n) to the program but is
106 * currently O(nc) to the program and number of combines, so the pass becomes
107 * effectively O(n^2). Better bookkeeping would bring down to linear if that's
111 bi_rewrite_uses(bi_context
*ctx
,
112 unsigned old
, unsigned oldc
,
113 unsigned new, unsigned newc
)
115 bi_foreach_instr_global(ctx
, ins
) {
116 bi_foreach_src(ins
, s
) {
117 if (ins
->src
[s
] != old
) continue;
119 for (unsigned i
= 0; i
< 16; ++i
)
120 ins
->swizzle
[s
][i
] += (newc
- oldc
);
127 /* Checks if we have a nicely aligned vector prefix */
131 bi_is_aligned_vec32(bi_instruction
*combine
, unsigned s
, bi_instruction
*io
,
134 /* We only support prefixes */
138 if (!(bi_class_props
[io
->type
] & BI_VECTOR
))
141 if (nir_alu_type_get_type_size(combine
->dest_type
) != 32)
144 if (nir_alu_type_get_type_size(io
->dest_type
) != 32)
147 unsigned components
= io
->vector_channels
;
149 /* Are we contiguous like that? */
151 for (unsigned i
= 0; i
< components
; ++i
) {
152 if (combine
->src
[i
] != io
->dest
)
155 if (combine
->swizzle
[i
][0] != i
)
159 /* We're good to go */
164 /* Tries to lower a given source of a combine to an appropriate rewrite,
165 * returning true if successful, and false with no changes otherwise. */
168 bi_lower_combine_src(bi_context
*ctx
, bi_instruction
*ins
, unsigned s
, unsigned R
,
171 unsigned src
= ins
->src
[s
];
173 /* We currently only handle SSA */
175 if (!src
) return false;
176 if (src
& (BIR_SPECIAL
| PAN_IS_REG
)) return false;
178 /* We are SSA. Lookup the generating instruction. */
179 unsigned bytes
= nir_alu_type_get_type_size(ins
->dest_type
) / 8;
181 bi_instruction
*parent
= bi_get_parent(ctx
, src
,
182 0xF << (ins
->swizzle
[s
][0] * bytes
));
184 if (!parent
) return false;
186 /* We have a parent instruction, sanity check the typesize */
187 unsigned pbytes
= nir_alu_type_get_type_size(parent
->dest_type
) / 8;
188 if (pbytes
!= bytes
) return false;
190 bool scalar
= parent
->vector_channels
!= 0;
191 if (!(scalar
|| bi_is_aligned_vec(ins
, s
, parent
, vec_count
))) return false;
193 if (!bi_shift_mask(parent
, bytes
* s
)) return false;
194 bi_rewrite_uses(ctx
, parent
->dest
, 0, R
, s
);
201 bi_lower_combine(bi_context
*ctx
, bi_block
*block
)
203 bi_foreach_instr_in_block_safe(block
, ins
) {
204 if (ins
->type
!= BI_COMBINE
) continue;
206 bool needs_rewrite
= !(ins
->dest
& PAN_IS_REG
);
207 unsigned R
= needs_rewrite
? bi_make_temp_reg(ctx
) : ins
->dest
;
208 unsigned sz
= nir_alu_type_get_type_size(ins
->dest_type
);
210 bi_foreach_src(ins
, s
) {
211 /* We're done early for vec2/3 */
216 unsigned vec_count
= 0;
218 if (bi_lower_combine_src(ctx
, ins
, s
, R
, &vec_count
)) {
219 /* Skip vectored sources */
221 s
+= (vec_count
- 1);
223 bi_insert_combine_mov(ctx
, ins
, s
, R
);
227 bi_combine_mov32(ctx
, ins
, s
, R
);
229 bi_combine_sel16(ctx
, ins
, s
, R
);
232 unreachable("Unknown COMBINE size");
237 bi_rewrite_uses(ctx
, ins
->dest
, 0, R
, 0);
239 bi_remove_instruction(ins
);