freedreno: a2xx: add ir2 copy propagation
[mesa.git] / src / gallium / drivers / freedreno / a2xx / ir2_private.h
1 /*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Jonathan Marek <jonathan@marek.ca>
25 */
26
27 #include <stdlib.h>
28 #include <stdint.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <assert.h>
32
33 #include "ir2.h"
34 #include "fd2_program.h"
35 #include "instr-a2xx.h"
36
/* Kind of operand a struct ir2_src refers to. The type selects how the
 * 'num' field of the source is interpreted.
 */
enum ir2_src_type {
	IR2_SRC_SSA,   /* num is the index of an instruction */
	IR2_SRC_REG,   /* num is an index in the ctx->reg array */
	IR2_SRC_INPUT, /* num is an index in the ctx->input array */
	IR2_SRC_CONST, /* num is a constant index (C0, C1, etc) */
};
43
/* Source operand of an instruction. */
struct ir2_src {
	/* num can mean different things
	 * ssa: index of instruction
	 * reg: index in ctx->reg array
	 * input: index in ctx->input array
	 * const: constant index (C0, C1, etc)
	 */
	uint16_t num;
	/* presumably in the per-channel encoding handled by
	 * swiz_set()/swiz_get() -- TODO confirm against users
	 */
	uint8_t swizzle;
	/* 2 bits: exactly enough for the four ir2_src_type values */
	enum ir2_src_type type : 2;
	uint8_t abs : 1;
	uint8_t negate : 1;
	/* unnamed bit-field: pads the flag bits out to a full byte */
	uint8_t : 4;
};
58
/* Register allocation state for a single component of a struct ir2_reg. */
struct ir2_reg_component {
	uint8_t c : 3; /* assigned x/y/z/w (7=dont write, for fetch instr) */
	bool alloc : 1; /* is it currently allocated */
	uint8_t ref_count; /* for ra */
};
64
/* A register value of up to four components. Instances are embedded in
 * instructions (ir2_instr::ssa) and stored in the context's 'reg' and
 * 'input' arrays.
 */
struct ir2_reg {
	uint8_t idx; /* assigned hardware register */
	uint8_t ncomp; /* number of components (at most 4, see comp[]) */

	uint8_t loop_depth;
	bool initialized;
	/* block_idx to free on (-1 = free on ref_count==0) */
	int block_idx_free;
	struct ir2_reg_component comp[4];
};
75
/* One IR instruction: its sources, its destination and a payload that
 * depends on the instruction class ('type').
 */
struct ir2_instr {
	unsigned idx;

	unsigned block_idx;

	/* instruction class; IR2_NONE entries are skipped by
	 * ir2_foreach_instr. Helpers in this header read the 'fetch' payload
	 * for IR2_FETCH and the 'alu' payload for IR2_ALU.
	 */
	enum {
		IR2_NONE,
		IR2_FETCH,
		IR2_ALU,
		IR2_CF,
	} type : 2;

	/* instruction needs to be emitted (for scheduling) */
	bool need_emit : 1;

	/* predicate value - (usually) same for entire block */
	uint8_t pred : 2;

	/* src: only the first src_count entries of src[] are used */
	uint8_t src_count;
	struct ir2_src src[4];

	/* dst: is_ssa selects the union member (see get_reg()): the
	 * embedded 'ssa' register when set, the pointed-to 'reg' otherwise
	 */
	bool is_ssa;
	union {
		struct ir2_reg ssa;
		struct ir2_reg *reg;
	};

	/* type-specific */
	union {
		struct {
			instr_fetch_opc_t opc : 5;
			union {
				struct {
					uint8_t const_idx;
					uint8_t const_idx_sel;
				} vtx;
				struct {
					/* cube fetches take 3 source components
					 * instead of 2 (see src_ncomp())
					 */
					bool is_cube : 1;
					bool is_rect : 1;
					uint8_t samp_id;
				} tex;
			};
		} fetch;
		struct {
			/* store possible opcs, then we can choose vector/scalar instr */
			instr_scalar_opc_t scalar_opc : 6;
			instr_vector_opc_t vector_opc : 5;
			/* same as nir */
			uint8_t write_mask : 4;
			bool saturate : 1;

			/* export idx (-1 no export) */
			int8_t export;

			/* for scalarized 2 src instruction */
			uint8_t src1_swizzle;
		} alu;
		struct {
			/* jmp dst block_idx */
			uint8_t block_idx;
		} cf;
	};
};
141
/* One entry of the context's instr_sched[] array. */
struct ir2_sched_instr {
	/* same size as ir2_context::reg_state (8 * 32 = 256 bits);
	 * presumably a copy of the RA state at this point -- TODO confirm
	 */
	uint32_t reg_state[8];
	/* NOTE(review): looks like a vector instruction and an optional
	 * scalar instruction issued together -- confirm against the scheduler
	 */
	struct ir2_instr *instr, *instr_s;
};
146
/* Compilation state for one shader. All instruction, register and
 * scheduling storage is held in fixed-size arrays inside this struct.
 */
struct ir2_context {
	struct fd2_shader_stateobj *so;

	unsigned block_idx, pred_idx;
	uint8_t pred;
	bool block_has_jump[64];

	unsigned loop_last_block[64];
	unsigned loop_depth;

	nir_shader *nir;

	/* ssa index of position output */
	struct ir2_src position;

	/* to translate SSA ids to instruction ids */
	int16_t ssa_map[1024];

	struct ir2_shader_info *info;
	struct ir2_frag_linkage *f;

	int prev_export;

	/* RA state */
	/* NULL entries are unused slots (see ir2_foreach_live_reg) */
	struct ir2_reg* live_regs[64];
	uint32_t reg_state[256/32]; /* 64*4 bits */

	/* inputs */
	struct ir2_reg input[16 + 1]; /* 16 + param */

	/* non-ssa regs */
	struct ir2_reg reg[64];
	unsigned reg_count;

	/* only the first instr_count entries are in use */
	struct ir2_instr instr[0x300];
	unsigned instr_count;

	struct ir2_sched_instr instr_sched[0x180];
	unsigned instr_sched_count;
};
187
/* Functions shared between the ir2 source files; their definitions are
 * not part of this header.
 */
void assemble(struct ir2_context *ctx, bool binning);

void ir2_nir_compile(struct ir2_context *ctx, bool binning);

/* register allocation ("ra") helpers */
void ra_count_refs(struct ir2_context *ctx);
void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx,
	bool export, uint8_t export_writemask);
void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr);
void ra_block_free(struct ir2_context *ctx, unsigned block);

/* presumably the copy propagation passes ("cp") -- TODO confirm */
void cp_src(struct ir2_context *ctx);
void cp_export(struct ir2_context *ctx);
200
201 /* utils */
/* Swizzle constants in the encoding used by swiz_set()/swiz_get():
 * channel i occupies bits [2*i+1 : 2*i] and holds (component - i) & 3,
 * i.e. the selected component relative to the channel's own position.
 * A field of 0 therefore selects the channel's own component, which is
 * what the unlisted channels of the short names below get.
 */
enum {
	/* channel 0 reads y/z/w, remaining channels unchanged */
	IR2_SWIZZLE_Y = 1 << 0,
	IR2_SWIZZLE_Z = 2 << 0,
	IR2_SWIZZLE_W = 3 << 0,

	/* channels 0,1 read z,w */
	IR2_SWIZZLE_ZW = 2 << 0 | 2 << 2,

	/* channels 0,1,2 read x,y,w */
	IR2_SWIZZLE_XYW = 0 << 0 | 0 << 2 | 1 << 4,

	IR2_SWIZZLE_XXXX = 0 << 0 | 3 << 2 | 2 << 4 | 1 << 6,
	IR2_SWIZZLE_YYYY = 1 << 0 | 0 << 2 | 3 << 4 | 2 << 6,
	IR2_SWIZZLE_ZZZZ = 2 << 0 | 1 << 2 | 0 << 4 | 3 << 6,
	IR2_SWIZZLE_WWWW = 3 << 0 | 2 << 2 | 1 << 4 | 0 << 6,
	IR2_SWIZZLE_WYWW = 3 << 0 | 0 << 2 | 1 << 4 | 0 << 6,
	IR2_SWIZZLE_XYXY = 0 << 0 | 0 << 2 | 2 << 4 | 2 << 6,
	IR2_SWIZZLE_ZZXY = 2 << 0 | 1 << 2 | 2 << 4 | 2 << 6,
	IR2_SWIZZLE_YXZZ = 1 << 0 | 3 << 2 | 0 << 4 | 3 << 6,
};
220
/* Report a compile failure: print the printf-style message to stdout and
 * then trip an assertion. The 'ctx' argument is accepted but unused.
 */
#define compile_error(ctx, ...) ({ \
	printf(__VA_ARGS__); \
	assert(0); \
})
225
226 static inline struct ir2_src
227 ir2_src(uint16_t num, uint8_t swizzle, enum ir2_src_type type)
228 {
229 return (struct ir2_src) {
230 .num = num,
231 .swizzle = swizzle,
232 .type = type
233 };
234 }
235
/* Declared here because the assembler uses it as well. Returns a source
 * operand; presumably one that reads as zero -- TODO confirm against the
 * definition, which is not in this header.
 */
struct ir2_src ir2_zero(struct ir2_context *ctx);
238
/* Iterate 'it' over the first instr_count entries of (ctx)->instr,
 * skipping entries whose type is IR2_NONE.
 */
#define ir2_foreach_instr(it, ctx) \
	for (struct ir2_instr *it = &(ctx)->instr[0]; ({ \
		for (; it != (ctx)->instr + (ctx)->instr_count; it++) \
			if (it->type != IR2_NONE) \
				break; \
		it != (ctx)->instr + (ctx)->instr_count; }); it++)
243
/* Iterate 'it' over every non-NULL entry of (ctx)->live_regs.
 *
 * The cursor (__ptr) is what gets advanced after each iteration.
 * Incrementing 'it' instead leaves the cursor on the same non-NULL slot,
 * so the same register is visited forever unless the loop body clears
 * that slot.
 */
#define ir2_foreach_live_reg(it, ctx) \
	for (struct ir2_reg **__ptr = (ctx)->live_regs, *it; ({ \
		while (__ptr != &(ctx)->live_regs[64] && *__ptr == NULL) __ptr++; \
		__ptr != &(ctx)->live_regs[64] ? (it=*__ptr) : NULL; }); __ptr++)
248
/* Iterate 'it' over the first avail_count entries of the 'avail' array;
 * both names must be visible at the point of use.
 *
 * The bound is tested before the slot is read, so the entry one past the
 * last valid one (avail[avail_count]) is never dereferenced.
 */
#define ir2_foreach_avail(it) \
	for (struct ir2_instr **__instrp = avail, *it; \
		__instrp != &avail[avail_count] && ((it = *__instrp), 1); __instrp++)
252
/* Iterate 'it' over the first src_count sources of an instruction.
 * The 'instr' argument is parenthesized so that any pointer expression
 * (e.g. &array[i]) can be passed, not just a plain identifier.
 */
#define ir2_foreach_src(it, instr) \
	for (struct ir2_src *it = (instr)->src; \
		it != &(instr)->src[(instr)->src_count]; it++)
256
257 /* mask for register allocation
258 * 64 registers with 4 components each = 256 bits
259 */
260 /* typedef struct {
261 uint64_t data[4];
262 } regmask_t; */
263
/* Test bit 'num' of a bitmask stored as an array of 32-bit words.
 * The shifted constant is unsigned so that bit 31 of a word can be
 * addressed without overflowing a signed int.
 */
static inline bool mask_isset(uint32_t * mask, unsigned num)
{
	return (mask[num / 32] & (UINT32_C(1) << (num % 32))) != 0;
}
268
/* Set bit 'num' of a bitmask stored as an array of 32-bit words.
 * The shifted constant is unsigned so that bit 31 of a word can be
 * addressed without overflowing a signed int.
 */
static inline void mask_set(uint32_t * mask, unsigned num)
{
	mask[num / 32] |= UINT32_C(1) << (num % 32);
}
273
/* Clear bit 'num' of a bitmask stored as an array of 32-bit words.
 * The shifted constant is unsigned so that bit 31 of a word can be
 * addressed without overflowing a signed int.
 */
static inline void mask_unset(uint32_t * mask, unsigned num)
{
	mask[num / 32] &= ~(UINT32_C(1) << (num % 32));
}
278
/* Return the 4 mask bits belonging to register 'num': each 32-bit word
 * holds eight registers of four bits each.
 */
static inline unsigned mask_reg(uint32_t * mask, unsigned num)
{
	const unsigned word = num / 8;
	const unsigned shift = (num % 8) * 4;

	return (mask[word] >> shift) & 0xf;
}
283
284 static inline bool is_export(struct ir2_instr *instr)
285 {
286 return instr->type == IR2_ALU && instr->alu.export >= 0;
287 }
288
289 static inline instr_alloc_type_t export_buf(unsigned num)
290 {
291 return num < 32 ? SQ_PARAMETER_PIXEL :
292 num >= 62 ? SQ_POSITION : SQ_MEMORY;
293 }
294
/* Encode "channel i reads component c": the 2-bit field for channel i
 * stores c relative to i (modulo 4), placed at bit position 2*i.
 */
static inline unsigned swiz_set(unsigned c, unsigned i)
{
	const unsigned rel = (c - i) & 3;

	return rel << (i * 2);
}
300
/* Decode the component selected for channel i: take the field starting
 * at bit 2*i, add the channel index back and wrap to 0..3.
 */
static inline unsigned swiz_get(unsigned swiz, unsigned i)
{
	const unsigned rel = swiz >> (i * 2);

	return (rel + i) & 3;
}
306
/* Compose two swizzles: for each channel, swiz1 picks a channel and
 * swiz0 is then looked up at that channel. Returns the combined swizzle.
 */
static inline unsigned swiz_merge(unsigned swiz0, unsigned swiz1)
{
	unsigned merged = 0;

	for (unsigned chan = 0; chan < 4; chan++) {
		const unsigned via = swiz_get(swiz1, chan);

		merged |= swiz_set(swiz_get(swiz0, via), chan);
	}
	return merged;
}
314
/* In-place variant of swiz_merge(): replaces *swiz0 with the composition
 * of *swiz0 and swiz1.
 */
static inline void swiz_merge_p(uint8_t *swiz0, unsigned swiz1)
{
	*swiz0 = swiz_merge(*swiz0, swiz1);
}
322
323 static inline struct ir2_reg * get_reg(struct ir2_instr *instr)
324 {
325 return instr->is_ssa ? &instr->ssa : instr->reg;
326 }
327
328 static inline struct ir2_reg *
329 get_reg_src(struct ir2_context *ctx, struct ir2_src *src)
330 {
331 switch (src->type) {
332 case IR2_SRC_INPUT:
333 return &ctx->input[src->num];
334 case IR2_SRC_SSA:
335 return &ctx->instr[src->num].ssa;
336 case IR2_SRC_REG:
337 return &ctx->reg[src->num];
338 default:
339 return NULL;
340 }
341 }
342
343 /* gets a ncomp value for the dst */
344 static inline unsigned dst_ncomp(struct ir2_instr *instr)
345 {
346 if (instr->is_ssa)
347 return instr->ssa.ncomp;
348
349 if (instr->type == IR2_FETCH)
350 return instr->reg->ncomp;
351
352 assert(instr->type == IR2_ALU);
353
354 unsigned ncomp = 0;
355 for (int i = 0; i < instr->reg->ncomp; i++)
356 ncomp += !!(instr->alu.write_mask & 1 << i);
357 return ncomp;
358 }
359
360 /* gets a ncomp value for the src registers */
361 static inline unsigned src_ncomp(struct ir2_instr *instr)
362 {
363 if (instr->type == IR2_FETCH) {
364 switch (instr->fetch.opc) {
365 case VTX_FETCH:
366 return 1;
367 case TEX_FETCH:
368 return instr->fetch.tex.is_cube ? 3 : 2;
369 case TEX_SET_TEX_LOD:
370 return 1;
371 default:
372 assert(0);
373 }
374 }
375
376 switch (instr->alu.scalar_opc) {
377 case PRED_SETEs ... KILLONEs:
378 return 1;
379 default:
380 break;
381 }
382
383 switch (instr->alu.vector_opc) {
384 case DOT2ADDv:
385 return 2;
386 case DOT3v:
387 return 3;
388 case DOT4v:
389 case CUBEv:
390 case PRED_SETE_PUSHv:
391 return 4;
392 default:
393 return dst_ncomp(instr);
394 }
395 }