bd0c0a5b69344c557b4b0aa44d758a2527594104
[mesa.git] / src / gallium / drivers / freedreno / ir3 / ir3.h
1 /*
2 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #ifndef IR3_H_
25 #define IR3_H_
26
27 #include <stdint.h>
28 #include <stdbool.h>
29
30 #include "util/u_debug.h"
31
32 #include "instr-a3xx.h"
33 #include "disasm.h" /* TODO move 'enum shader_t' somewhere else.. */
34
35 /* low level intermediate representation of an adreno shader program */
36
/* Forward declarations; the complete types are defined further down
 * in this header.
 */
struct ir3;
struct ir3_instruction;
struct ir3_block;

/* Declared here, defined elsewhere.
 * NOTE(review): judging by the name this builds an ir3 program from
 * assembly source text in 'src' - confirm against the implementation,
 * including what is returned on a parse failure.
 */
struct ir3 *fd_asm_parse(const char *src);
42
/* Per-shader summary; handed to ir3_assemble() through its 'info'
 * parameter.
 */
struct ir3_info {
	uint16_t sizedwords;
	/* instruction count, expanded to account for rpt's */
	uint16_t instrs_count;

	/* NOTE: the max_* fields below do not account for registers that
	 * the shader itself never touches (ie. vertex data fetched via
	 * VFD_DECODE but not touched by the shader).
	 */

	/* highest GPR # used by shader */
	int8_t max_reg;
	int8_t max_half_reg;
	int16_t max_const;
};
54
/* One register operand of an instruction.  The meaning of the two
 * anonymous unions depends on 'flags'.
 */
struct ir3_register {
	enum {
		IR3_REG_CONST   = 1 << 0,
		IR3_REG_IMMED   = 1 << 1,
		IR3_REG_HALF    = 1 << 2,
		IR3_REG_RELATIV = 1 << 3,
		IR3_REG_R       = 1 << 4,
		IR3_REG_NEGATE  = 1 << 5,
		IR3_REG_ABS     = 1 << 6,
		IR3_REG_EVEN    = 1 << 7,
		IR3_REG_POS_INF = 1 << 8,
		/* (ei) flag, end-input?  Set on the last bary, presumably
		 * to signal that the shader needs no more input.
		 */
		IR3_REG_EI      = 1 << 9,

		/* Meta-flags, only meaningful in the intermediate stages
		 * of the IR, ie. before register assignment is done:
		 */
		IR3_REG_SSA     = 1 << 12,  /* 'instr' is ptr to assigning instr */
		IR3_REG_IA      = 1 << 13,  /* meta-input dst is "assigned" */
		IR3_REG_ADDR    = 1 << 14,  /* register is a0.x */
	} flags;

	union {
		/* Normal registers: the component lives in the low two
		 * bits of the reg #, so rN.x is encoded as (N << 2) | x.
		 */
		int num;

		/* Immediate value, as integer or as float: */
		int iim_val;
		float fim_val;

		/* Relative addressing: */
		int offset;

		/* With IR3_REG_SSA set, a src register points back at the
		 * instruction that assigns it.
		 */
		struct ir3_instruction *instr;
	};

	union {
		/* Used by cat5 instructions, and also for internal/IR level
		 * tracking of which registers an instruction reads/writes.
		 * 'wrmask' may be a bad name, since it describes both src
		 * and dst operands that touch multiple adjacent registers.
		 */
		unsigned wrmask;

		/* For relative addressing a 32 bit mask is too small to
		 * describe the array, but on the other hand disjoint sets
		 * never occur, so a plain size (number of scalar
		 * components) is used instead.
		 */
		unsigned size;
	};
};
108
109 #define IR3_INSTR_SRCS 10
110
111 struct ir3_instruction {
112 struct ir3_block *block;
113 int category;
114 opc_t opc;
115 enum {
116 /* (sy) flag is set on first instruction, and after sample
117 * instructions (probably just on RAW hazard).
118 */
119 IR3_INSTR_SY = 0x001,
120 /* (ss) flag is set on first instruction, and first instruction
121 * to depend on the result of "long" instructions (RAW hazard):
122 *
123 * rcp, rsq, log2, exp2, sin, cos, sqrt
124 *
125 * It seems to synchronize until all in-flight instructions are
126 * completed, for example:
127 *
128 * rsq hr1.w, hr1.w
129 * add.f hr2.z, (neg)hr2.z, hc0.y
130 * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
131 * rsq hr2.x, hr2.x
132 * (rpt1)nop
133 * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
134 * nop
135 * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
136 * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
137 * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
138 *
139 * The last mul.f does not have (ss) set, presumably because the
140 * (ss) on the previous instruction does the job.
141 *
142 * The blob driver also seems to set it on WAR hazards, although
143 * not really clear if this is needed or just blob compiler being
144 * sloppy. So far I haven't found a case where removing the (ss)
145 * causes problems for WAR hazard, but I could just be getting
146 * lucky:
147 *
148 * rcp r1.y, r3.y
149 * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
150 *
151 */
152 IR3_INSTR_SS = 0x002,
153 /* (jp) flag is set on jump targets:
154 */
155 IR3_INSTR_JP = 0x004,
156 IR3_INSTR_UL = 0x008,
157 IR3_INSTR_3D = 0x010,
158 IR3_INSTR_A = 0x020,
159 IR3_INSTR_O = 0x040,
160 IR3_INSTR_P = 0x080,
161 IR3_INSTR_S = 0x100,
162 IR3_INSTR_S2EN = 0x200,
163 /* meta-flags, for intermediate stages of IR, ie.
164 * before register assignment is done:
165 */
166 IR3_INSTR_MARK = 0x1000,
167 } flags;
168 int repeat;
169 unsigned regs_count;
170 struct ir3_register *regs[1 + IR3_INSTR_SRCS];
171 union {
172 struct {
173 char inv;
174 char comp;
175 int immed;
176 } cat0;
177 struct {
178 type_t src_type, dst_type;
179 } cat1;
180 struct {
181 enum {
182 IR3_COND_LT = 0,
183 IR3_COND_LE = 1,
184 IR3_COND_GT = 2,
185 IR3_COND_GE = 3,
186 IR3_COND_EQ = 4,
187 IR3_COND_NE = 5,
188 } condition;
189 } cat2;
190 struct {
191 unsigned samp, tex;
192 type_t type;
193 } cat5;
194 struct {
195 type_t type;
196 int offset;
197 int iim_val;
198 } cat6;
199 /* for meta-instructions, just used to hold extra data
200 * before instruction scheduling, etc
201 */
202 struct {
203 int off; /* component/offset */
204 } fo;
205 struct {
206 struct ir3_block *if_block, *else_block;
207 } flow;
208 struct {
209 struct ir3_block *block;
210 } inout;
211
212 /* XXX keep this as big as all other union members! */
213 uint32_t info[3];
214 };
215
216 /* transient values used during various algorithms: */
217 union {
218 /* The instruction depth is the max dependency distance to output.
219 *
220 * You can also think of it as the "cost", if we did any sort of
221 * optimization for register footprint. Ie. a value that is just
222 * result of moving a const to a reg would have a low cost, so to
223 * it could make sense to duplicate the instruction at various
224 * points where the result is needed to reduce register footprint.
225 *
226 * DEPTH_UNUSED used to mark unused instructions after depth
227 * calculation pass.
228 */
229 #define DEPTH_UNUSED ~0
230 unsigned depth;
231
232 /* Used just during cp stage, which comes before depth pass.
233 * For fanin, where we need a sequence of consecutive registers,
234 * keep track of each src instructions left (ie 'n-1') and right
235 * (ie 'n+1') neighbor. The front-end must insert enough mov's
236 * to ensure that each instruction has at most one left and at
237 * most one right neighbor. During the copy-propagation pass,
238 * we only remove mov's when we can preserve this constraint.
239 */
240 struct {
241 struct ir3_instruction *left, *right;
242 uint16_t left_cnt, right_cnt;
243 } cp;
244 };
245 struct ir3_instruction *next;
246 #ifdef DEBUG
247 uint32_t serialno;
248 #endif
249 };
250
struct ir3_heap_chunk;

/* Top-level shader object. */
struct ir3 {
	/* Array of all instructions; ir3_clear_mark() walks the first
	 * 'instrs_count' entries.
	 * NOTE(review): 'instrs_sz' is presumably the allocated capacity
	 * of the array - confirm in the implementation.
	 */
	unsigned instrs_count;
	unsigned instrs_sz;
	struct ir3_instruction **instrs;

	/* NOTE(review): looks like the same count/capacity/array layout,
	 * tracking bary.f instructions - confirm.
	 */
	unsigned baryfs_count;
	unsigned baryfs_sz;
	struct ir3_instruction **baryfs;

	struct ir3_block *block;

	/* NOTE(review): presumably state of the allocator behind
	 * ir3_alloc() - confirm.
	 */
	unsigned heap_idx;
	struct ir3_heap_chunk *chunk;
};
262
/* A block of instructions belonging to a shader. */
struct ir3_block {
	struct ir3 *shader;

	/* element counts of the three arrays below */
	unsigned ntemporaries;
	unsigned ninputs;
	unsigned noutputs;

	/* maps a TGSI_FILE_TEMPORARY index back to the assigning
	 * instruction:
	 */
	struct ir3_instruction **temporaries;
	struct ir3_instruction **inputs;
	struct ir3_instruction **outputs;

	/* there is only a single address register: */
	struct ir3_instruction *address;

	struct ir3_block *parent;
	struct ir3_instruction *head;
};
275
276 struct ir3 * ir3_create(void);
277 void ir3_destroy(struct ir3 *shader);
278 void * ir3_assemble(struct ir3 *shader,
279 struct ir3_info *info, uint32_t gpu_id);
280 void * ir3_alloc(struct ir3 *shader, int sz);
281
282 struct ir3_block * ir3_block_create(struct ir3 *shader,
283 unsigned ntmp, unsigned nin, unsigned nout);
284
285 struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
286 int category, opc_t opc);
287 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
288 const char *ir3_instr_name(struct ir3_instruction *instr);
289
290 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
291 int num, int flags);
292
293
294 static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
295 {
296 if (instr->flags & IR3_INSTR_MARK)
297 return true; /* already visited */
298 instr->flags |= IR3_INSTR_MARK;
299 return false;
300 }
301
302 static inline void ir3_clear_mark(struct ir3 *shader)
303 {
304 /* TODO would be nice to drop the instruction array.. for
305 * new compiler, _clear_mark() is all we use it for, and
306 * we could probably manage a linked list instead..
307 *
308 * Also, we'll probably want to mark instructions within
309 * a block, so tracking the list of instrs globally is
310 * unlikely to be what we want.
311 */
312 unsigned i;
313 for (i = 0; i < shader->instrs_count; i++) {
314 struct ir3_instruction *instr = shader->instrs[i];
315 instr->flags &= ~IR3_INSTR_MARK;
316 }
317 }
318
319 static inline int ir3_instr_regno(struct ir3_instruction *instr,
320 struct ir3_register *reg)
321 {
322 unsigned i;
323 for (i = 0; i < instr->regs_count; i++)
324 if (reg == instr->regs[i])
325 return i;
326 return -1;
327 }
328
329
/* Pack a register number and a component into a register id:
 * (num << 2) | comp, with only the low two bits of 'comp' used.
 *
 * comp:
 *   0 - x
 *   1 - y
 *   2 - z
 *   3 - w
 *
 * The arithmetic is done on uint32_t so that the shift is well defined
 * for every input; shifting a negative (or too large) signed int left
 * is undefined behavior.  Results for non-negative in-range 'num' are
 * unchanged.
 */
static inline uint32_t regid(int num, int comp)
{
	return ((uint32_t)num << 2) | ((uint32_t)comp & 0x3u);
}
340
341 static inline uint32_t reg_num(struct ir3_register *reg)
342 {
343 return reg->num >> 2;
344 }
345
346 static inline uint32_t reg_comp(struct ir3_register *reg)
347 {
348 return reg->num & 0x3;
349 }
350
351 static inline bool is_flow(struct ir3_instruction *instr)
352 {
353 return (instr->category == 0);
354 }
355
356 static inline bool is_kill(struct ir3_instruction *instr)
357 {
358 return is_flow(instr) && (instr->opc == OPC_KILL);
359 }
360
361 static inline bool is_nop(struct ir3_instruction *instr)
362 {
363 return is_flow(instr) && (instr->opc == OPC_NOP);
364 }
365
366 static inline bool is_alu(struct ir3_instruction *instr)
367 {
368 return (1 <= instr->category) && (instr->category <= 3);
369 }
370
371 static inline bool is_sfu(struct ir3_instruction *instr)
372 {
373 return (instr->category == 4);
374 }
375
376 static inline bool is_tex(struct ir3_instruction *instr)
377 {
378 return (instr->category == 5);
379 }
380
381 static inline bool is_input(struct ir3_instruction *instr)
382 {
383 return (instr->category == 2) && (instr->opc == OPC_BARY_F);
384 }
385
386 static inline bool is_meta(struct ir3_instruction *instr)
387 {
388 /* TODO how should we count PHI (and maybe fan-in/out) which
389 * might actually contribute some instructions to the final
390 * result?
391 */
392 return (instr->category == -1);
393 }
394
395 static inline bool is_addr(struct ir3_instruction *instr)
396 {
397 return is_meta(instr) && (instr->opc == OPC_META_DEREF);
398 }
399
400 static inline bool writes_addr(struct ir3_instruction *instr)
401 {
402 if (instr->regs_count > 0) {
403 struct ir3_register *dst = instr->regs[0];
404 return !!(dst->flags & IR3_REG_ADDR);
405 }
406 return false;
407 }
408
409 static inline bool writes_pred(struct ir3_instruction *instr)
410 {
411 if (instr->regs_count > 0) {
412 struct ir3_register *dst = instr->regs[0];
413 return reg_num(dst) == REG_P0;
414 }
415 return false;
416 }
417
418 static inline bool reg_gpr(struct ir3_register *r)
419 {
420 if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_ADDR))
421 return false;
422 if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
423 return false;
424 return true;
425 }
426
427 /* dump: */
428 #include <stdio.h>
/* Debug dumping.
 * XXX maybe the 'block' ptr taken by ir3_dump() should move to ir3?
 */
void ir3_dump(struct ir3 *shader, const char *name,
		struct ir3_block *block, FILE *f);
void ir3_dump_instr_single(struct ir3_instruction *instr);
void ir3_dump_instr_list(struct ir3_instruction *instr);

/* The compiler passes below are declared here and defined elsewhere.
 * NOTE(review): the meaning of the int return values is not visible
 * from this header - confirm against the implementations.
 */

/* pass: flatten if/else */
int ir3_block_flatten(struct ir3_block *block);

/* pass: depth calculation */
int ir3_delayslots(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n);
void ir3_block_depth(struct ir3_block *block);

/* pass: copy-propagate */
void ir3_block_cp(struct ir3_block *block);

/* pass: scheduling */
int ir3_block_sched(struct ir3_block *block);

/* pass: register assignment */
int ir3_block_ra(struct ir3_block *block, enum shader_t type,
		bool half_precision, bool frag_coord, bool frag_face);

/* pass: legalize */
void ir3_block_legalize(struct ir3_block *block,
		bool *has_samp, int *max_bary);
456
/* Element count of a true array (not of a pointer); only defined here
 * when no earlier header has provided it already.
 */
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif
460
461 /* ************************************************************************* */
462 /* split this out or find some helper to use.. like main/bitset.h.. */
463
464 #include <string.h>
465
/* Upper bound (exclusive) on the reg ids tracked per precision. */
#define MAX_REG 256

/* Bitset with one bit per reg id: bits [0, MAX_REG) are used for full
 * registers and bits [MAX_REG, 2 * MAX_REG) for half registers.
 */
typedef uint8_t regmask_t[(2 * MAX_REG) / 8];
469
470 static inline unsigned regmask_idx(struct ir3_register *reg)
471 {
472 unsigned num = reg->num;
473 debug_assert(num < MAX_REG);
474 if (reg->flags & IR3_REG_HALF)
475 num += MAX_REG;
476 return num;
477 }
478
479 static inline void regmask_init(regmask_t *regmask)
480 {
481 memset(regmask, 0, sizeof(*regmask));
482 }
483
484 static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
485 {
486 unsigned idx = regmask_idx(reg);
487 if (reg->flags & IR3_REG_RELATIV) {
488 unsigned i;
489 for (i = 0; i < reg->size; i++, idx++)
490 (*regmask)[idx / 8] |= 1 << (idx % 8);
491 } else {
492 unsigned mask;
493 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
494 if (mask & 1)
495 (*regmask)[idx / 8] |= 1 << (idx % 8);
496 }
497 }
498
499 static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
500 {
501 unsigned i;
502 for (i = 0; i < ARRAY_SIZE(*dst); i++)
503 (*dst)[i] = (*a)[i] | (*b)[i];
504 }
505
506 /* set bits in a if not set in b, conceptually:
507 * a |= (reg & ~b)
508 */
509 static inline void regmask_set_if_not(regmask_t *a,
510 struct ir3_register *reg, regmask_t *b)
511 {
512 unsigned idx = regmask_idx(reg);
513 if (reg->flags & IR3_REG_RELATIV) {
514 unsigned i;
515 for (i = 0; i < reg->size; i++, idx++)
516 if (!((*b)[idx / 8] & (1 << (idx % 8))))
517 (*a)[idx / 8] |= 1 << (idx % 8);
518 } else {
519 unsigned mask;
520 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
521 if (mask & 1)
522 if (!((*b)[idx / 8] & (1 << (idx % 8))))
523 (*a)[idx / 8] |= 1 << (idx % 8);
524 }
525 }
526
527 static inline bool regmask_get(regmask_t *regmask,
528 struct ir3_register *reg)
529 {
530 unsigned idx = regmask_idx(reg);
531 if (reg->flags & IR3_REG_RELATIV) {
532 unsigned i;
533 for (i = 0; i < reg->size; i++, idx++)
534 if ((*regmask)[idx / 8] & (1 << (idx % 8)))
535 return true;
536 } else {
537 unsigned mask;
538 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
539 if (mask & 1)
540 if ((*regmask)[idx / 8] & (1 << (idx % 8)))
541 return true;
542 }
543 return false;
544 }
545
546 /* ************************************************************************* */
547
548 #endif /* IR3_H_ */