/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"

/* NIR creates vectors as vecN ops, which we represent by a synthetic
 * BI_COMBINE instruction, e.g.:
 *
 *      v = combine x, y, z, w
 *
 * These combines need to be lowered by the pass in this file. Fix a given
 * source at component c.
 *
 * First suppose the source is SSA. If it is also scalar, then we may rewrite
 * the destination of the generating instruction (unique by SSA+scalar) to
 * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
 * (the original component, since the source is scalar). If it is a vector,
 * there are two cases: if component c is `x` and the succeeding components
 * y, z, ... up to the last channel of the vector are accessed sequentially,
 * then we may perform the same rewrite. Otherwise, rewriting would require
 * more complex vector features, so we fall back on a move.
 *
 * If the source is not SSA, we also fall back on a move. We could probably
 * do better.
 */
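
/* Concretely, with the rewrite optimization currently disabled (see the
 * #if 0 block below), a 32-bit
 *
 *      v = combine x, y, z, w
 *
 * lowers to one 32-bit move per component, each writing successive channels
 * of a single destination register (pseudocode):
 *
 *      v.x = mov x
 *      v.y = mov y
 *      v.z = mov z
 *      v.w = mov w
 *
 * A 16-bit combine instead packs each pair of components into one 32-bit
 * word with a SELECT:
 *
 *      v.x = select x, y
 *      v.y = select z, w
 */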

static void
bi_combine_mov32(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        bi_instruction move = {
                .type = BI_MOV,
                .dest = R,
                .dest_type = nir_type_uint32,
                .dest_offset = comp,
                .src = { parent->src[comp] },
                .src_types = { nir_type_uint32 },
                .swizzle = { { parent->swizzle[comp][0] } }
        };

        bi_emit_before(ctx, parent, move);
}

static void
bi_combine_sel16(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        bi_instruction sel = {
                .type = BI_SELECT,
                .dest = R,
                .dest_type = nir_type_uint32,
                .dest_offset = comp >> 1,
                .src = { parent->src[comp], parent->src[comp + 1] },
                .src_types = { nir_type_uint16, nir_type_uint16 },
                .swizzle = { {
                        parent->swizzle[comp][0],
                        parent->swizzle[comp + 1][0],
                } }
        };

        bi_emit_before(ctx, parent, sel);
}

/* Gets the instruction generating a given source. Combine lowering is
 * accidentally O(n^2) right now because this function is O(n) instead of
 * O(1). If this pass is slow, this cost can be avoided in favour of better
 * bookkeeping. */
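
/* One sketch of that bookkeeping (hypothetical -- not wired up here): build
 * a map from destination index to generating instruction once per pass,
 * e.g. with the util hash table, and query it instead of walking the shader:
 *
 *      struct hash_table_u64 *defs = _mesa_hash_table_u64_create(NULL);
 *
 *      bi_foreach_instr_global(ctx, I)
 *              _mesa_hash_table_u64_insert(defs, I->dest, I);
 *
 * after which each lookup is O(1) and the pass is linear overall. */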

static bi_instruction *
bi_get_parent(bi_context *ctx, unsigned idx)
{
        bi_foreach_instr_global(ctx, ins) {
                if (ins->dest == idx)
                        return ins;
        }

        return NULL;
}

/* Rewrites uses of an index. Again, this could be O(n) in the size of the
 * program, but is currently O(nc) in the program size and the number of
 * combines, so the pass becomes effectively O(n^2). Better bookkeeping would
 * bring it down to linear if that's an issue. */
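
/* For example, bi_rewrite_uses(ctx, old, 0, new, 2) repoints every read of
 * `old` at `new`, bumping each swizzle entry by 2, so a use of old.x becomes
 * a use of new.z. */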

static void
bi_rewrite_uses(bi_context *ctx,
                unsigned old, unsigned oldc,
                unsigned new, unsigned newc)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_src(ins, s) {
                        if (ins->src[s] != old) continue;

                        for (unsigned i = 0; i < 16; ++i)
                                ins->swizzle[s][i] += (newc - oldc);

                        ins->src[s] = new;
                }
        }
}

/* Checks if we have a nicely aligned vector prefix */
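/* That is: do the first `count` sources of the combine read channels
 * 0, 1, ..., count - 1 of a single 32-bit vector instruction, in order? If
 * so, that instruction could write the combine's destination directly rather
 * than being copied channel by channel. */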

static bool
bi_is_aligned_vec32(bi_instruction *combine, unsigned s, bi_instruction *io,
                unsigned *count)
{
        /* We only support prefixes */
        if (s != 0)
                return false;

        if (!(bi_class_props[io->type] & BI_VECTOR))
                return false;

        if (nir_alu_type_get_type_size(combine->dest_type) != 32)
                return false;

        if (nir_alu_type_get_type_size(io->dest_type) != 32)
                return false;

        unsigned components = io->vector_channels;

        /* Are we contiguous like that? */

        for (unsigned i = 0; i < components; ++i) {
                if (combine->src[i] != io->dest)
                        return false;

                if (combine->swizzle[i][0] != i)
                        return false;
        }

        /* We're good to go */
        *count = components;
        return true;
}

#if 0
/* Tries to lower a given source of a combine to an appropriate rewrite,
 * returning true if successful, and false with no changes otherwise.
 *
 * Note this block is stale: it references a masked bi_get_parent() variant,
 * bi_is_aligned_vec() and bi_shift_mask(), none of which exist in this file
 * any more. It is kept for reference until the rewrite optimization is
 * revived. */

static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
                unsigned *vec_count)
{
        unsigned src = ins->src[s];

        /* We currently only handle SSA */

        if (!src) return false;
        if (src & (BIR_SPECIAL | PAN_IS_REG)) return false;

        /* We are SSA. Lookup the generating instruction. */
        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;

        bi_instruction *parent = bi_get_parent(ctx, src,
                        0xF << (ins->swizzle[s][0] * bytes));

        if (!parent) return false;

        /* We have a parent instruction, sanity check the typesize */
        unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
        if (pbytes != bytes) return false;

        /* Scalar means no vector channels are written */
        bool scalar = (parent->vector_channels == 0);
        if (!(scalar || bi_is_aligned_vec(ins, s, parent, vec_count))) return false;

        if (!bi_shift_mask(parent, bytes * s)) return false;
        bi_rewrite_uses(ctx, parent->dest, 0, R, s);
        parent->dest = R;
        return true;
}
#endif

void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
        bi_foreach_instr_in_block_safe(block, ins) {
                if (ins->type != BI_COMBINE) continue;

                /* If the destination is already a register, write its
                 * components in place; otherwise make a temporary register
                 * and rewrite uses afterwards */
                bool needs_rewrite = !(ins->dest & PAN_IS_REG);
                unsigned R = needs_rewrite ? bi_make_temp_reg(ctx) : ins->dest;
                unsigned sz = nir_alu_type_get_type_size(ins->dest_type);

                bi_foreach_src(ins, s) {
                        /* We're done early for vec2/3 */
                        if (!ins->src[s])
                                continue;

#if 0
                        /* Stale rewrite path, see bi_lower_combine_src above */
                        unsigned vec_count = 0;

                        if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
                                /* Skip vectored sources */
                                if (vec_count)
                                        s += (vec_count - 1);
                        } else {
                                bi_insert_combine_mov(ctx, ins, s, R);
                        }
#endif
                        if (sz == 32)
                                bi_combine_mov32(ctx, ins, s, R);
                        else if (sz == 16) {
                                /* The SELECT consumed two components, so
                                 * skip the next one */
                                bi_combine_sel16(ctx, ins, s, R);
                                s++;
                        } else {
                                unreachable("Unknown COMBINE size");
                        }
                }

                if (needs_rewrite)
                        bi_rewrite_uses(ctx, ins->dest, 0, R, 0);

                bi_remove_instruction(ins);
        }
}
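
/* Hypothetical call-site sketch; the actual driver loop lives in
 * bifrost_compile.c:
 *
 *      bi_foreach_block(ctx, _block)
 *              bi_lower_combine(ctx, (bi_block *) _block);
 */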