freedreno/ir3: remove unused helper
[mesa.git] / src / freedreno / ir3 / ir3.h
/*
 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef IR3_H_
#define IR3_H_

#include <stdint.h>
#include <stdbool.h>

#include "compiler/shader_enums.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/set.h"
#include "util/u_debug.h"

#include "instr-a3xx.h"

/* low level intermediate representation of an adreno shader program */

struct ir3_compiler;
struct ir3;
struct ir3_instruction;
struct ir3_block;
struct ir3_info {
	uint32_t gpu_id;
	uint16_t sizedwords;
	uint16_t instrs_count;   /* expanded to account for rpt's */
	uint16_t nops_count;     /* # of nop instructions, including nopN */
	/* NOTE: max_reg, etc, does not include registers not touched
	 * by the shader (ie. vertex fetched via VFD_DECODE but not
	 * touched by shader)
	 */
	int8_t   max_reg;        /* highest GPR # used by shader */
	int8_t   max_half_reg;
	int16_t  max_const;

	/* number of sync bits: */
	uint16_t ss, sy;

	/* estimate of number of cycles stalled on (ss) */
	uint16_t sstall;

	uint16_t last_baryf;     /* instruction # of last varying fetch */
};

struct ir3_register {
	enum {
		IR3_REG_CONST  = 0x001,
		IR3_REG_IMMED  = 0x002,
		IR3_REG_HALF   = 0x004,
		/* high registers are used for some things in compute shaders,
		 * for example.  Seems to be for values that are global/shared
		 * across all the threads in a wave.
		 */
		IR3_REG_HIGH   = 0x008,
		IR3_REG_RELATIV= 0x010,
		IR3_REG_R      = 0x020,
		/* Most instructions, it seems, can do float abs/neg but not
		 * integer.  The CP pass needs to know what is intended (int or
		 * float) in order to do the right thing.  For this reason the
		 * abs/neg flags are split out into float and int variants.  In
		 * addition, for .b (bitwise) operations the negate is actually
		 * a bitwise not, so split that out into a new flag to make it
		 * clearer.
		 */
		IR3_REG_FNEG   = 0x040,
		IR3_REG_FABS   = 0x080,
		IR3_REG_SNEG   = 0x100,
		IR3_REG_SABS   = 0x200,
		IR3_REG_BNOT   = 0x400,
		IR3_REG_EVEN   = 0x800,
		IR3_REG_POS_INF= 0x1000,
		/* (ei) flag, end-input?  Set on last bary, presumably to signal
		 * that the shader needs no more input:
		 */
		IR3_REG_EI     = 0x2000,
		/* meta-flags, for intermediate stages of IR, ie.
		 * before register assignment is done:
		 */
		IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
		IR3_REG_ARRAY  = 0x8000,

	} flags;

	/* used for cat5 instructions, but also for internal/IR level
	 * tracking of what registers are read/written by an instruction.
	 * wrmask may be a bad name since it is used to represent both
	 * src and dst that touch multiple adjacent registers.
	 */
	unsigned wrmask : 16;   /* up to vec16 */

	/* for relative addressing, 32bits for array size is too small,
	 * but otoh we don't need to deal with disjoint sets, so instead
	 * use a simple size field (number of scalar components).
	 *
	 * Note the size field isn't important for relative const (since
	 * we don't have to do register allocation for constants).
	 */
	unsigned size : 15;

	bool merged : 1;        /* half-regs conflict with full regs (ie >= a6xx) */

	/* normal registers:
	 * the component is in the low two bits of the reg #, so
	 * rN.x becomes: (N << 2) | x
	 */
	uint16_t num;
	union {
		/* immediate: */
		int32_t  iim_val;
		uint32_t uim_val;
		float    fim_val;
		/* relative: */
		struct {
			uint16_t id;
			int16_t offset;
		} array;
	};

	/* For IR3_REG_SSA, src registers contain ptr back to assigning
	 * instruction.
	 *
	 * For IR3_REG_ARRAY, the pointer is back to the last dependent
	 * array access (although the net effect is the same, it points
	 * back to a previous instruction that we depend on).
	 */
	struct ir3_instruction *instr;
};

/*
 * Stupid/simple growable array implementation:
 */
#define DECLARE_ARRAY(type, name) \
	unsigned name ## _count, name ## _sz; \
	type * name;

#define array_insert(ctx, arr, val) do { \
		if (arr ## _count == arr ## _sz) { \
			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
		} \
		arr[arr ##_count++] = val; \
	} while (0)

struct ir3_instruction {
	struct ir3_block *block;
	opc_t opc;
	enum {
		/* (sy) flag is set on first instruction, and after sample
		 * instructions (probably just on RAW hazard).
		 */
		IR3_INSTR_SY    = 0x001,
		/* (ss) flag is set on first instruction, and first instruction
		 * to depend on the result of "long" instructions (RAW hazard):
		 *
		 *   rcp, rsq, log2, exp2, sin, cos, sqrt
		 *
		 * It seems to synchronize until all in-flight instructions are
		 * completed, for example:
		 *
		 *   rsq hr1.w, hr1.w
		 *   add.f hr2.z, (neg)hr2.z, hc0.y
		 *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
		 *   rsq hr2.x, hr2.x
		 *   (rpt1)nop
		 *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
		 *   nop
		 *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
		 *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
		 *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
		 *
		 * The last mul.f does not have (ss) set, presumably because the
		 * (ss) on the previous instruction does the job.
		 *
		 * The blob driver also seems to set it on WAR hazards, although
		 * it is not really clear if this is needed or just the blob
		 * compiler being sloppy.  So far I haven't found a case where
		 * removing the (ss) causes problems for a WAR hazard, but I
		 * could just be getting lucky:
		 *
		 *   rcp r1.y, r3.y
		 *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
		 *
		 */
		IR3_INSTR_SS    = 0x002,
		/* (jp) flag is set on jump targets:
		 */
		IR3_INSTR_JP    = 0x004,
		IR3_INSTR_UL    = 0x008,
		IR3_INSTR_3D    = 0x010,
		IR3_INSTR_A     = 0x020,
		IR3_INSTR_O     = 0x040,
		IR3_INSTR_P     = 0x080,
		IR3_INSTR_S     = 0x100,
		IR3_INSTR_S2EN  = 0x200,
		IR3_INSTR_G     = 0x400,
		IR3_INSTR_SAT   = 0x800,
		/* meta-flags, for intermediate stages of IR, ie.
		 * before register assignment is done:
		 */
		IR3_INSTR_MARK  = 0x1000,
		IR3_INSTR_UNUSED= 0x2000,
	} flags;
	uint8_t repeat;
	uint8_t nop;
#ifdef DEBUG
	unsigned regs_max;
#endif
	unsigned regs_count;
	struct ir3_register **regs;
	union {
		struct {
			char inv;
			char comp;
			int  immed;
			struct ir3_block *target;
		} cat0;
		struct {
			type_t src_type, dst_type;
		} cat1;
		struct {
			enum {
				IR3_COND_LT = 0,
				IR3_COND_LE = 1,
				IR3_COND_GT = 2,
				IR3_COND_GE = 3,
				IR3_COND_EQ = 4,
				IR3_COND_NE = 5,
			} condition;
		} cat2;
		struct {
			unsigned samp, tex;
			type_t type;
		} cat5;
		struct {
			type_t type;
			int src_offset;
			int dst_offset;
			int iim_val : 3;   /* for ldgb/stgb, # of components */
			unsigned d : 3;
			bool typed : 1;
		} cat6;
		struct {
			unsigned w : 1;    /* write */
			unsigned r : 1;    /* read */
			unsigned l : 1;    /* local */
			unsigned g : 1;    /* global */
		} cat7;
		/* for meta-instructions, just used to hold extra data
		 * before instruction scheduling, etc
		 */
		struct {
			int off;           /* component/offset */
		} split;
		struct {
			/* for output collects, this maps back to the entry in the
			 * ir3_shader_variant::outputs table.
			 */
			int outidx;
		} collect;
		struct {
			unsigned samp, tex;
			unsigned input_offset;
		} prefetch;
		struct {
			/* maps back to entry in ir3_shader_variant::inputs table: */
			int inidx;
			/* for sysvals, identifies the sysval type.  Mostly so we can
			 * identify the special cases where a sysval should not be DCE'd
			 * (currently, just pre-fs texture fetch)
			 */
			gl_system_value sysval;
		} input;
	};

	/* transient values used during various algorithms: */
	union {
		/* The instruction depth is the max dependency distance to output.
		 *
		 * You can also think of it as the "cost", if we did any sort of
		 * optimization for register footprint.  Ie. a value that is just
		 * the result of moving a const to a reg would have a low cost,
		 * so it could make sense to duplicate the instruction at various
		 * points where the result is needed to reduce register footprint.
		 */
		int depth;
		/* When we get to the RA stage, we no longer need depth, but
		 * we do need the instruction's position/name:
		 */
		struct {
			uint16_t ip;
			uint16_t name;
		};
	};

	/* used for per-pass extra instruction data.
	 *
	 * TODO we should remove the per-pass data like this and 'use_count'
	 * and do something similar to what RA does w/ ir3_ra_instr_data..
	 * ie. use the ir3_count_instructions pass, and then use instr->ip
	 * to index into a table of pass-private data.
	 */
	void *data;

	int sun;            /* Sethi–Ullman number, used by sched */
	int use_count;      /* currently just updated/used by cp */

	/* Used during CP and RA stages.  For collect and shader inputs/
	 * outputs where we need a sequence of consecutive registers,
	 * keep track of each src instruction's left (ie. 'n-1') and right
	 * (ie. 'n+1') neighbor.  The front-end must insert enough mov's
	 * to ensure that each instruction has at most one left and at
	 * most one right neighbor.  During the copy-propagation pass,
	 * we only remove mov's when we can preserve this constraint.
	 * And during the RA stage, we use the neighbor information to
	 * allocate a block of registers in one shot.
	 *
	 * TODO: maybe just add something like:
	 *   struct ir3_instruction_ref {
	 *       struct ir3_instruction *instr;
	 *       unsigned cnt;
	 *   }
	 *
	 * Or can we get away without the refcnt stuff?  It seems like
	 * it should be overkill.. the problem is, potentially after
	 * already eliminating some mov's, you could have a single mov
	 * that needs to be grouped with its neighbors in two different
	 * places (ex. shader output and a collect).
	 */
	struct {
		struct ir3_instruction *left, *right;
		uint16_t left_cnt, right_cnt;
	} cp;

	/* an instruction can reference at most one address register amongst
	 * its src/dst registers.  Beyond that, you need to insert mov's.
	 *
	 * NOTE: do not write this directly, use ir3_instr_set_address()
	 */
	struct ir3_instruction *address;

	/* Tracking for additional dependent instructions.  Used to handle
	 * barriers, WAR hazards for arrays/SSBOs/etc.
	 */
	DECLARE_ARRAY(struct ir3_instruction *, deps);

	/*
	 * From PoV of instruction scheduling, not execution (ie. ignores global/
	 * local distinction):
	 *                           shared  image  atomic  SSBO  everything
	 *   barrier()/                R/W    R/W    R/W     R/W       X
	 *     groupMemoryBarrier()
	 *   memoryBarrier()                  R/W    R/W
	 *   (but only images declared coherent?)
	 *   memoryBarrierAtomic()                   R/W
	 *   memoryBarrierBuffer()                           R/W
	 *   memoryBarrierImage()             R/W
	 *   memoryBarrierShared()     R/W
	 *
	 * TODO I think for SSBO/image/shared, in cases where we can determine
	 * which variable is accessed, we don't need to care about accesses to
	 * different variables (unless declared coherent??)
	 */
	enum {
		IR3_BARRIER_EVERYTHING = 1 << 0,
		IR3_BARRIER_SHARED_R   = 1 << 1,
		IR3_BARRIER_SHARED_W   = 1 << 2,
		IR3_BARRIER_IMAGE_R    = 1 << 3,
		IR3_BARRIER_IMAGE_W    = 1 << 4,
		IR3_BARRIER_BUFFER_R   = 1 << 5,
		IR3_BARRIER_BUFFER_W   = 1 << 6,
		IR3_BARRIER_ARRAY_R    = 1 << 7,
		IR3_BARRIER_ARRAY_W    = 1 << 8,
	} barrier_class, barrier_conflict;

	/* Entry in ir3_block's instruction list: */
	struct list_head node;

#ifdef DEBUG
	uint32_t serialno;
#endif

	// TODO only computerator/assembler:
	int line;
};

static inline struct ir3_instruction *
ir3_neighbor_first(struct ir3_instruction *instr)
{
	int cnt = 0;
	while (instr->cp.left) {
		instr = instr->cp.left;
		if (++cnt > 0xffff) {
			debug_assert(0);
			break;
		}
	}
	return instr;
}

static inline int ir3_neighbor_count(struct ir3_instruction *instr)
{
	int num = 1;

	debug_assert(!instr->cp.left);

	while (instr->cp.right) {
		num++;
		instr = instr->cp.right;
		if (num > 0xffff) {
			debug_assert(0);
			break;
		}
	}

	return num;
}
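
/* A small sketch (hypothetical helper, not part of ir3): the total size
 * of the consecutive-register group an instruction belongs to can be
 * found by walking to the left-most neighbor and counting rightwards:
 */
static inline int example_group_size(struct ir3_instruction *instr)
{
	return ir3_neighbor_count(ir3_neighbor_first(instr));
}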

struct ir3 {
	struct ir3_compiler *compiler;
	gl_shader_stage type;

	DECLARE_ARRAY(struct ir3_instruction *, inputs);
	DECLARE_ARRAY(struct ir3_instruction *, outputs);

	/* Track bary.f (and ldlv) instructions.. this is needed in
	 * scheduling to ensure that all varying fetches happen before
	 * any potential kill instructions.  The hw gets grumpy if all
	 * threads in a group are killed before the last bary.f gets
	 * a chance to signal end of input (ei).
	 */
	DECLARE_ARRAY(struct ir3_instruction *, baryfs);

	/* Track all indirect instructions (read and write).  To avoid
	 * a deadlock scenario where an address register gets scheduled,
	 * but other dependent src instructions cannot be scheduled due
	 * to dependency on a *different* address register value, the
	 * scheduler needs to ensure that all of an instruction's
	 * dependencies other than the address register are scheduled
	 * before the one that writes the address register.  Having a
	 * convenient list of instructions that reference some address
	 * register simplifies this.
	 */
	DECLARE_ARRAY(struct ir3_instruction *, indirects);

	/* and same for instructions that consume the predicate register: */
	DECLARE_ARRAY(struct ir3_instruction *, predicates);

	/* Track texture sample instructions which need texture state
	 * patched in (for astc-srgb workaround):
	 */
	DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

	/* List of blocks: */
	struct list_head block_list;

	/* List of ir3_array's: */
	struct list_head array_list;

	unsigned max_sun;   /* max Sethi–Ullman number */

#ifdef DEBUG
	unsigned block_count, instr_count;
#endif
};
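
/* The growable arrays above are appended to with array_insert(); eg.
 * (illustrative sketch, hypothetical helper) a pass that found a bary.f
 * instruction would track it with:
 */
static inline void example_track_baryf(struct ir3 *ir, struct ir3_instruction *instr)
{
	array_insert(ir, ir->baryfs, instr);
}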

struct ir3_array {
	struct list_head node;
	unsigned length;
	unsigned id;

	struct nir_register *r;

	/* To avoid array writes from getting DCE'd, keep track of the
	 * most recent write.  Any array access depends on the most
	 * recent write.  This way, nothing depends on writes after the
	 * last read.  But all the writes that happen before that have
	 * something depending on them.
	 */
	struct ir3_instruction *last_write;

	/* extra stuff used in RA pass: */
	unsigned base;      /* base vreg name */
	unsigned reg;       /* base physical reg */
	uint16_t start_ip, end_ip;

	/* Indicates if half-precision */
	bool half;
};

struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);

struct ir3_block {
	struct list_head node;
	struct ir3 *shader;

	const struct nir_block *nblock;

	struct list_head instr_list;   /* list of ir3_instruction */

	/* each block has either one or two successors.. in case of
	 * two successors, 'condition' decides which one to follow.
	 * A block preceding an if/else has two successors.
	 */
	struct ir3_instruction *condition;
	struct ir3_block *successors[2];

	struct set *predecessors;      /* set of ir3_block */

	uint16_t start_ip, end_ip;

	/* Track instructions which do not write a register but otherwise
	 * must not be discarded (such as kill, stg, etc)
	 */
	DECLARE_ARRAY(struct ir3_instruction *, keeps);

	/* used for per-pass extra block data.  Mainly used right
	 * now in RA step to track livein/liveout.
	 */
	void *data;

#ifdef DEBUG
	uint32_t serialno;
#endif
};

static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
	return block->serialno;
#else
	return (uint32_t)(unsigned long)block;
#endif
}

struct ir3 * ir3_create(struct ir3_compiler *compiler, gl_shader_stage type);
void ir3_destroy(struct ir3 *shader);
void * ir3_assemble(struct ir3 *shader,
		struct ir3_info *info, uint32_t gpu_id);
void * ir3_alloc(struct ir3 *shader, int sz);

struct ir3_block * ir3_block_create(struct ir3 *shader);

struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
		opc_t opc, int nreg);
struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
const char *ir3_instr_name(struct ir3_instruction *instr);

struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
		int num, int flags);
struct ir3_register * ir3_reg_clone(struct ir3 *shader,
		struct ir3_register *reg);

void ir3_instr_set_address(struct ir3_instruction *instr,
		struct ir3_instruction *addr);

static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
{
	if (instr->flags & IR3_INSTR_MARK)
		return true;   /* already visited */
	instr->flags |= IR3_INSTR_MARK;
	return false;
}

void ir3_block_clear_mark(struct ir3_block *block);
void ir3_clear_mark(struct ir3 *shader);
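
/* Sketch of the visit-once pattern these support (hypothetical helper):
 * clear all marks up front, then check-and-set the mark as instructions
 * are visited so each one is processed only once:
 */
static inline void example_visit(struct ir3_instruction *instr)
{
	if (ir3_instr_check_mark(instr))
		return;   /* already handled on a previous path */
	/* ... process instr, then recurse into its srcs ... */
}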

unsigned ir3_count_instructions(struct ir3 *ir);


#define MAX_ARRAYS 16

/* comp:
 *   0 - x
 *   1 - y
 *   2 - z
 *   3 - w
 */
static inline uint32_t regid(int num, int comp)
{
	return (num << 2) | (comp & 0x3);
}

static inline uint32_t reg_num(struct ir3_register *reg)
{
	return reg->num >> 2;
}

static inline uint32_t reg_comp(struct ir3_register *reg)
{
	return reg->num & 0x3;
}
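
/* e.g. regid(2, 1) yields (2 << 2) | 1 == 0x9, the encoding of r2.y;
 * reg_num() and reg_comp() recover the 2 and the 1 respectively.
 */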

#define INVALID_REG regid(63, 0)
#define VALIDREG(r) ((r) != INVALID_REG)
#define CONDREG(r, val) COND(VALIDREG(r), (val))

static inline bool is_flow(struct ir3_instruction *instr)
{
	return (opc_cat(instr->opc) == 0);
}

static inline bool is_kill(struct ir3_instruction *instr)
{
	return instr->opc == OPC_KILL;
}

static inline bool is_nop(struct ir3_instruction *instr)
{
	return instr->opc == OPC_NOP;
}

static inline bool is_same_type_reg(struct ir3_register *reg1,
		struct ir3_register *reg2)
{
	unsigned type_reg1 = (reg1->flags & (IR3_REG_HIGH | IR3_REG_HALF));
	unsigned type_reg2 = (reg2->flags & (IR3_REG_HIGH | IR3_REG_HALF));

	if (type_reg1 ^ type_reg2)
		return false;
	else
		return true;
}

/* Is it a non-transformative (ie. not type changing) mov?  This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool is_same_type_mov(struct ir3_instruction *instr)
{
	struct ir3_register *dst;

	switch (instr->opc) {
	case OPC_MOV:
		if (instr->cat1.src_type != instr->cat1.dst_type)
			return false;
		/* If the type of dest reg and src reg are different,
		 * it shouldn't be considered as same type mov
		 */
		if (!is_same_type_reg(instr->regs[0], instr->regs[1]))
			return false;
		break;
	case OPC_ABSNEG_F:
	case OPC_ABSNEG_S:
		if (instr->flags & IR3_INSTR_SAT)
			return false;
		/* If the type of dest reg and src reg are different,
		 * it shouldn't be considered as same type mov
		 */
		if (!is_same_type_reg(instr->regs[0], instr->regs[1]))
			return false;
		break;
	default:
		return false;
	}

	dst = instr->regs[0];

	/* mov's that write to a0.x or p0.x are special: */
	if (dst->num == regid(REG_P0, 0))
		return false;
	if (dst->num == regid(REG_A0, 0))
		return false;

	if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
		return false;

	return true;
}

/* A move from const, which changes size but not type, can also be
 * folded into the dest instruction in some cases.
 */
static inline bool is_const_mov(struct ir3_instruction *instr)
{
	if (instr->opc != OPC_MOV)
		return false;

	if (!(instr->regs[1]->flags & IR3_REG_CONST))
		return false;

	type_t src_type = instr->cat1.src_type;
	type_t dst_type = instr->cat1.dst_type;

	return (type_float(src_type) && type_float(dst_type)) ||
		(type_uint(src_type) && type_uint(dst_type)) ||
		(type_sint(src_type) && type_sint(dst_type));
}

static inline bool is_alu(struct ir3_instruction *instr)
{
	return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
}

static inline bool is_sfu(struct ir3_instruction *instr)
{
	return (opc_cat(instr->opc) == 4);
}

static inline bool is_tex(struct ir3_instruction *instr)
{
	return (opc_cat(instr->opc) == 5);
}

static inline bool is_tex_or_prefetch(struct ir3_instruction *instr)
{
	return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
}

static inline bool is_mem(struct ir3_instruction *instr)
{
	return (opc_cat(instr->opc) == 6);
}

static inline bool is_barrier(struct ir3_instruction *instr)
{
	return (opc_cat(instr->opc) == 7);
}

static inline bool
is_half(struct ir3_instruction *instr)
{
	return !!(instr->regs[0]->flags & IR3_REG_HALF);
}

static inline bool
is_high(struct ir3_instruction *instr)
{
	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
}

static inline bool
is_store(struct ir3_instruction *instr)
{
	/* for these instructions, the "destination" register is
	 * actually a source, the address to store to.
	 */
	switch (instr->opc) {
	case OPC_STG:
	case OPC_STGB:
	case OPC_STIB:
	case OPC_STP:
	case OPC_STL:
	case OPC_STLW:
	case OPC_L2G:
	case OPC_G2L:
		return true;
	default:
		return false;
	}
}

static inline bool is_load(struct ir3_instruction *instr)
{
	switch (instr->opc) {
	case OPC_LDG:
	case OPC_LDGB:
	case OPC_LDIB:
	case OPC_LDL:
	case OPC_LDP:
	case OPC_L2G:
	case OPC_LDLW:
	case OPC_LDC:
	case OPC_LDLV:
		/* probably some others too.. */
		return true;
	default:
		return false;
	}
}

static inline bool is_input(struct ir3_instruction *instr)
{
	/* in some cases, ldlv is used to fetch varying without
	 * interpolation.. fortunately inloc is the first src
	 * register in either case
	 */
	switch (instr->opc) {
	case OPC_LDLV:
	case OPC_BARY_F:
		return true;
	default:
		return false;
	}
}

static inline bool is_bool(struct ir3_instruction *instr)
{
	switch (instr->opc) {
	case OPC_CMPS_F:
	case OPC_CMPS_S:
	case OPC_CMPS_U:
		return true;
	default:
		return false;
	}
}

static inline bool is_meta(struct ir3_instruction *instr)
{
	return (opc_cat(instr->opc) == -1);
}

static inline unsigned dest_regs(struct ir3_instruction *instr)
{
	if ((instr->regs_count == 0) || is_store(instr) || is_flow(instr))
		return 0;

	return util_last_bit(instr->regs[0]->wrmask);
}

static inline bool writes_addr(struct ir3_instruction *instr)
{
	if (instr->regs_count > 0) {
		struct ir3_register *dst = instr->regs[0];
		return reg_num(dst) == REG_A0;
	}
	return false;
}

static inline bool writes_pred(struct ir3_instruction *instr)
{
	if (instr->regs_count > 0) {
		struct ir3_register *dst = instr->regs[0];
		return reg_num(dst) == REG_P0;
	}
	return false;
}

/* returns defining instruction for reg */
/* TODO better name */
static inline struct ir3_instruction *ssa(struct ir3_register *reg)
{
	if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
		return reg->instr;
	}
	return NULL;
}

static inline bool conflicts(struct ir3_instruction *a,
		struct ir3_instruction *b)
{
	return (a && b) && (a != b);
}

static inline bool reg_gpr(struct ir3_register *r)
{
	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
		return false;
	if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
		return false;
	return true;
}

static inline type_t half_type(type_t type)
{
	switch (type) {
	case TYPE_F32: return TYPE_F16;
	case TYPE_U32: return TYPE_U16;
	case TYPE_S32: return TYPE_S16;
	case TYPE_F16:
	case TYPE_U16:
	case TYPE_S16:
		return type;
	default:
		assert(0);
		return ~0;
	}
}

/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool ir3_cat2_int(opc_t opc)
{
	switch (opc) {
	case OPC_ADD_U:
	case OPC_ADD_S:
	case OPC_SUB_U:
	case OPC_SUB_S:
	case OPC_CMPS_U:
	case OPC_CMPS_S:
	case OPC_MIN_U:
	case OPC_MIN_S:
	case OPC_MAX_U:
	case OPC_MAX_S:
	case OPC_CMPV_U:
	case OPC_CMPV_S:
	case OPC_MUL_U24:
	case OPC_MUL_S24:
	case OPC_MULL_U:
	case OPC_CLZ_S:
	case OPC_ABSNEG_S:
	case OPC_AND_B:
	case OPC_OR_B:
	case OPC_NOT_B:
	case OPC_XOR_B:
	case OPC_BFREV_B:
	case OPC_CLZ_B:
	case OPC_SHL_B:
	case OPC_SHR_B:
	case OPC_ASHR_B:
	case OPC_MGEN_B:
	case OPC_GETBIT_B:
	case OPC_CBITS_B:
	case OPC_BARY_F:
		return true;

	default:
		return false;
	}
}

static inline bool ir3_cat2_float(opc_t opc)
{
	switch (opc) {
	case OPC_ADD_F:
	case OPC_MIN_F:
	case OPC_MAX_F:
	case OPC_MUL_F:
	case OPC_SIGN_F:
	case OPC_CMPS_F:
	case OPC_ABSNEG_F:
	case OPC_CMPV_F:
	case OPC_FLOOR_F:
	case OPC_CEIL_F:
	case OPC_RNDNE_F:
	case OPC_RNDAZ_F:
	case OPC_TRUNC_F:
		return true;

	default:
		return false;
	}
}

static inline bool ir3_cat3_float(opc_t opc)
{
	switch (opc) {
	case OPC_MAD_F16:
	case OPC_MAD_F32:
	case OPC_SEL_F16:
	case OPC_SEL_F32:
		return true;
	default:
		return false;
	}
}

/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned ir3_cat2_absneg(opc_t opc)
{
	switch (opc) {
	case OPC_ADD_F:
	case OPC_MIN_F:
	case OPC_MAX_F:
	case OPC_MUL_F:
	case OPC_SIGN_F:
	case OPC_CMPS_F:
	case OPC_ABSNEG_F:
	case OPC_CMPV_F:
	case OPC_FLOOR_F:
	case OPC_CEIL_F:
	case OPC_RNDNE_F:
	case OPC_RNDAZ_F:
	case OPC_TRUNC_F:
	case OPC_BARY_F:
		return IR3_REG_FABS | IR3_REG_FNEG;

	case OPC_ADD_U:
	case OPC_ADD_S:
	case OPC_SUB_U:
	case OPC_SUB_S:
	case OPC_CMPS_U:
	case OPC_CMPS_S:
	case OPC_MIN_U:
	case OPC_MIN_S:
	case OPC_MAX_U:
	case OPC_MAX_S:
	case OPC_CMPV_U:
	case OPC_CMPV_S:
	case OPC_MUL_U24:
	case OPC_MUL_S24:
	case OPC_MULL_U:
	case OPC_CLZ_S:
		return 0;

	case OPC_ABSNEG_S:
		return IR3_REG_SABS | IR3_REG_SNEG;

	case OPC_AND_B:
	case OPC_OR_B:
	case OPC_NOT_B:
	case OPC_XOR_B:
	case OPC_BFREV_B:
	case OPC_CLZ_B:
	case OPC_SHL_B:
	case OPC_SHR_B:
	case OPC_ASHR_B:
	case OPC_MGEN_B:
	case OPC_GETBIT_B:
	case OPC_CBITS_B:
		return IR3_REG_BNOT;

	default:
		return 0;
	}
}

/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned ir3_cat3_absneg(opc_t opc)
{
	switch (opc) {
	case OPC_MAD_F16:
	case OPC_MAD_F32:
	case OPC_SEL_F16:
	case OPC_SEL_F32:
		return IR3_REG_FNEG;

	case OPC_MAD_U16:
	case OPC_MADSH_U16:
	case OPC_MAD_S16:
	case OPC_MADSH_M16:
	case OPC_MAD_U24:
	case OPC_MAD_S24:
	case OPC_SEL_S16:
	case OPC_SEL_S32:
	case OPC_SAD_S16:
	case OPC_SAD_S32:
		/* neg *may* work on 3rd src.. */

	case OPC_SEL_B16:
	case OPC_SEL_B32:

	default:
		return 0;
	}
}

#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instruction's sources (reg), also returns src #: */
#define foreach_src_n(__srcreg, __n, __instr) \
	if ((__instr)->regs_count) \
		for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
			if ((__srcreg = (__instr)->regs[__n + 1]))

/* iterator for an instruction's sources (reg): */
#define foreach_src(__srcreg, __instr) \
	foreach_src_n(__srcreg, __i, __instr)
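
/* Usage sketch (hypothetical helper): count an instruction's immediate
 * srcs.  Note the iterator skips regs[0], which is the dst:
 */
static inline unsigned example_count_immeds(struct ir3_instruction *instr)
{
	struct ir3_register *reg;
	unsigned cnt = 0;
	foreach_src(reg, instr) {
		if (reg->flags & IR3_REG_IMMED)
			cnt++;
	}
	return cnt;
}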

static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
{
	unsigned cnt = instr->regs_count + instr->deps_count;
	if (instr->address)
		cnt++;
	return cnt;
}

static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
{
	if (n == (instr->regs_count + instr->deps_count))
		return instr->address;
	if (n >= instr->regs_count)
		return instr->deps[n - instr->regs_count];
	return ssa(instr->regs[n]);
}

static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
{
	if (n == (instr->regs_count + instr->deps_count))
		return false;
	if (n >= instr->regs_count)
		return true;
	return false;
}

#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr) \
	for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
		if ((__srcinst = __ssa_src_n(__instr, __n)))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr) \
	foreach_ssa_src_n(__srcinst, __i, __instr)
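
/* Usage sketch (hypothetical helper): test whether 'instr' has 'src'
 * among its SSA sources (including false deps and the address reg):
 */
static inline bool example_depends_on(struct ir3_instruction *instr,
		struct ir3_instruction *src)
{
	struct ir3_instruction *s;
	foreach_ssa_src(s, instr)
		if (s == src)
			return true;
	return false;
}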

/* iterators for shader inputs: */
#define foreach_input_n(__ininstr, __cnt, __ir) \
	for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
		if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) \
	foreach_input_n(__ininstr, __i, __ir)

/* iterators for shader outputs: */
#define foreach_output_n(__outinstr, __cnt, __ir) \
	for (unsigned __cnt = 0; __cnt < (__ir)->outputs_count; __cnt++) \
		if ((__outinstr = (__ir)->outputs[__cnt]))
#define foreach_output(__outinstr, __ir) \
	foreach_output_n(__outinstr, __i, __ir)

/* iterators for instructions: */
#define foreach_instr(__instr, __list) \
	list_for_each_entry(struct ir3_instruction, __instr, __list, node)
#define foreach_instr_rev(__instr, __list) \
	list_for_each_entry_rev(struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list) \
	list_for_each_entry_safe(struct ir3_instruction, __instr, __list, node)

/* iterators for blocks: */
#define foreach_block(__block, __list) \
	list_for_each_entry(struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list) \
	list_for_each_entry_safe(struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list) \
	list_for_each_entry(struct ir3_array, __array, __list, node)

/* dump: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);

/* delay calculation: */
int ir3_delayslots(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred);
void ir3_remove_nops(struct ir3 *ir);

/* depth calculation: */
struct ir3_shader_variant;
void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
void ir3_depth(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
void ir3_cf(struct ir3 *ir);

/* copy-propagate: */
void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);

/* group neighbors and insert mov's to resolve conflicts: */
void ir3_group(struct ir3 *ir);

/* Sethi–Ullman numbering: */
void ir3_sun(struct ir3 *ir);

/* scheduling: */
void ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
int ir3_postsched(struct ir3_context *ctx);

bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);

/* register assignment: */
struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor);

/* legalize: */
void ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);

static inline bool
ir3_has_latency_to_hide(struct ir3 *ir)
{
	/* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
	 * know the nature of the fragment shader.  Just assume it will have
	 * latency to hide:
	 */
	if (ir->type != MESA_SHADER_FRAGMENT)
		return true;

	foreach_block (block, &ir->block_list) {
		foreach_instr (instr, &block->instr_list) {
			if (is_tex_or_prefetch(instr))
				return true;

			if (is_load(instr)) {
				switch (instr->opc) {
				case OPC_LDLV:
				case OPC_LDL:
				case OPC_LDLW:
					break;
				default:
					return true;
				}
			}
		}
	}

	return false;
}

/* ************************************************************************* */
/* instruction helpers */

/* creates SSA src of correct type (ie. half vs full precision) */
static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
		struct ir3_instruction *src, unsigned flags)
{
	struct ir3_register *reg;
	if (src->regs[0]->flags & IR3_REG_HALF)
		flags |= IR3_REG_HALF;
	reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags);
	reg->instr = src;
	reg->wrmask = src->regs[0]->wrmask;
	return reg;
}

static inline struct ir3_register * __ssa_dst(struct ir3_instruction *instr)
{
	struct ir3_register *reg = ir3_reg_create(instr, 0, 0);
	reg->flags |= IR3_REG_SSA;
	return reg;
}

static inline struct ir3_instruction *
create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
{
	struct ir3_instruction *mov;
	unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;

	mov = ir3_instr_create(block, OPC_MOV);
	mov->cat1.src_type = type;
	mov->cat1.dst_type = type;
	__ssa_dst(mov)->flags |= flags;
	ir3_reg_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;

	return mov;
}

static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
	return create_immed_typed(block, val, TYPE_U32);
}
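
/* e.g. (sketch): create_immed(block, 0) materializes the constant 0 as
 * a mov-from-immediate whose result can then be used as an SSA src.
 */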

static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
{
	struct ir3_instruction *mov;
	unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;

	mov = ir3_instr_create(block, OPC_MOV);
	mov->cat1.src_type = type;
	mov->cat1.dst_type = type;
	__ssa_dst(mov)->flags |= flags;
	ir3_reg_create(mov, n, IR3_REG_CONST | flags);

	return mov;
}

static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
	return create_uniform_typed(block, n, TYPE_F32);
}

static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block *block, int n,
		struct ir3_instruction *address)
{
	struct ir3_instruction *mov;

	mov = ir3_instr_create(block, OPC_MOV);
	mov->cat1.src_type = TYPE_U32;
	mov->cat1.dst_type = TYPE_U32;
	__ssa_dst(mov);
	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

	ir3_instr_set_address(mov, address);

	return mov;
}

static inline struct ir3_instruction *
ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
{
	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
	__ssa_dst(instr);
	if (src->regs[0]->flags & IR3_REG_ARRAY) {
		struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
		src_reg->array = src->regs[0]->array;
	} else {
		__ssa_src(instr, src, src->regs[0]->flags & IR3_REG_HIGH);
	}
	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
	instr->cat1.src_type = type;
	instr->cat1.dst_type = type;
	return instr;
}

static inline struct ir3_instruction *
ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
		type_t src_type, type_t dst_type)
{
	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
	unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
	unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;

	debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags);

	__ssa_dst(instr)->flags |= dst_flags;
	__ssa_src(instr, src, 0);
	instr->cat1.src_type = src_type;
	instr->cat1.dst_type = dst_type;
	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
	return instr;
}

static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
	return ir3_instr_create(block, OPC_NOP);
}

#define IR3_INSTR_0 0

#define __INSTR0(flag, name, opc) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create(block, opc); \
	instr->flags |= flag; \
	return instr; \
}
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name) __INSTR0(0, name, OPC_##name)

#define __INSTR1(flag, name, opc) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create(block, opc); \
	__ssa_dst(instr); \
	__ssa_src(instr, a, aflags); \
	instr->flags |= flag; \
	return instr; \
}
#define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR1(name) __INSTR1(0, name, OPC_##name)

#define __INSTR2(flag, name, opc) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags, \
		struct ir3_instruction *b, unsigned bflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create(block, opc); \
	__ssa_dst(instr); \
	__ssa_src(instr, a, aflags); \
	__ssa_src(instr, b, bflags); \
	instr->flags |= flag; \
	return instr; \
}
#define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR2(name) __INSTR2(0, name, OPC_##name)

#define __INSTR3(flag, name, opc) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags, \
		struct ir3_instruction *b, unsigned bflags, \
		struct ir3_instruction *c, unsigned cflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create2(block, opc, 4); \
	__ssa_dst(instr); \
	__ssa_src(instr, a, aflags); \
	__ssa_src(instr, b, bflags); \
	__ssa_src(instr, c, cflags); \
	instr->flags |= flag; \
	return instr; \
}
#define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR3(name) __INSTR3(0, name, OPC_##name)

#define __INSTR4(flag, name, opc) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags, \
		struct ir3_instruction *b, unsigned bflags, \
		struct ir3_instruction *c, unsigned cflags, \
		struct ir3_instruction *d, unsigned dflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create2(block, opc, 5); \
	__ssa_dst(instr); \
	__ssa_src(instr, a, aflags); \
	__ssa_src(instr, b, bflags); \
	__ssa_src(instr, c, cflags); \
	__ssa_src(instr, d, dflags); \
	instr->flags |= flag; \
	return instr; \
}
#define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR4(name) __INSTR4(0, name, OPC_##name)

/* cat0 instructions: */
INSTR1(BR)
INSTR0(JUMP)
INSTR1(KILL)
INSTR0(END)
INSTR0(CHSH)
INSTR0(CHMASK)
INSTR1(IF)
INSTR0(ELSE)
INSTR0(ENDIF)

/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U24)
INSTR2(MUL_S24)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)
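
/* Sketch of using the generated builders (hypothetical helper): each
 * INSTRn(name) above expands to an ir3_name() emitter taking src
 * instruction + src-flag pairs, eg. a float subtract via negate:
 */
static inline struct ir3_instruction *
example_sub_f(struct ir3_block *block, struct ir3_instruction *a,
		struct ir3_instruction *b)
{
	return ir3_ADD_F(block, a, 0, b, IR3_REG_FNEG);
}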

/* cat3 instructions: */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions: */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(HRSQ)
INSTR1(LOG2)
INSTR1(HLOG2)
INSTR1(EXP2)
INSTR1(HEXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions: */
INSTR1(DSX)
INSTR1(DSXPP_1)
INSTR1(DSY)
INSTR1(DSYPP_1)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)

static inline struct ir3_instruction *
ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
		unsigned wrmask, unsigned flags, struct ir3_instruction *samp_tex,
		struct ir3_instruction *src0, struct ir3_instruction *src1)
{
	struct ir3_instruction *sam;

	sam = ir3_instr_create(block, opc);
	sam->flags |= flags | IR3_INSTR_S2EN;
	__ssa_dst(sam)->wrmask = wrmask;
	__ssa_src(sam, samp_tex, IR3_REG_HALF);
	if (src0) {
		__ssa_src(sam, src0, 0)->wrmask = (1 << (src0->regs_count - 1)) - 1;
	}
	if (src1) {
		__ssa_src(sam, src1, 0)->wrmask = (1 << (src1->regs_count - 1)) - 1;
	}
	sam->cat5.type = type;

	return sam;
}

/* cat6 instructions: */
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(STG)
INSTR3(STL)
INSTR3(STLW)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
#if GPU >= 600
INSTR3(STIB);
INSTR2(LDIB);
INSTR3F(G, ATOMIC_ADD)
INSTR3F(G, ATOMIC_SUB)
INSTR3F(G, ATOMIC_XCHG)
INSTR3F(G, ATOMIC_INC)
INSTR3F(G, ATOMIC_DEC)
INSTR3F(G, ATOMIC_CMPXCHG)
INSTR3F(G, ATOMIC_MIN)
INSTR3F(G, ATOMIC_MAX)
INSTR3F(G, ATOMIC_AND)
INSTR3F(G, ATOMIC_OR)
INSTR3F(G, ATOMIC_XOR)
#elif GPU >= 400
INSTR3(LDGB)
INSTR4(STGB)
INSTR4(STIB)
INSTR4F(G, ATOMIC_ADD)
INSTR4F(G, ATOMIC_SUB)
INSTR4F(G, ATOMIC_XCHG)
INSTR4F(G, ATOMIC_INC)
INSTR4F(G, ATOMIC_DEC)
INSTR4F(G, ATOMIC_CMPXCHG)
INSTR4F(G, ATOMIC_MIN)
INSTR4F(G, ATOMIC_MAX)
INSTR4F(G, ATOMIC_AND)
INSTR4F(G, ATOMIC_OR)
INSTR4F(G, ATOMIC_XOR)
#endif

INSTR4F(G, STG)

/* cat7 instructions: */
INSTR0(BAR)
INSTR0(FENCE)

/* meta instructions: */
INSTR0(META_TEX_PREFETCH);

/* ************************************************************************* */
/* split this out or find some helper to use.. like main/bitset.h.. */

#include <string.h>
#include "util/bitset.h"

#define MAX_REG 256

typedef BITSET_DECLARE(regmask_t, 2 * MAX_REG);

static inline bool
__regmask_get(regmask_t *regmask, struct ir3_register *reg, unsigned n)
{
	if (reg->merged) {
		/* a6xx+ case, with merged register file, we track things in terms
		 * of half-precision registers, with a full-precision register
		 * using two half-precision slots:
		 */
		if (reg->flags & IR3_REG_HALF) {
			return BITSET_TEST(*regmask, n);
		} else {
			n *= 2;
			return BITSET_TEST(*regmask, n) || BITSET_TEST(*regmask, n+1);
		}
	} else {
		/* pre a6xx case, with separate register file for half and full
		 * precision:
		 */
		if (reg->flags & IR3_REG_HALF)
			n += MAX_REG;
		return BITSET_TEST(*regmask, n);
	}
}

static inline void
__regmask_set(regmask_t *regmask, struct ir3_register *reg, unsigned n)
{
	if (reg->merged) {
		/* a6xx+ case, with merged register file, we track things in terms
		 * of half-precision registers, with a full-precision register
		 * using two half-precision slots:
		 */
		if (reg->flags & IR3_REG_HALF) {
			BITSET_SET(*regmask, n);
		} else {
			n *= 2;
			BITSET_SET(*regmask, n);
			BITSET_SET(*regmask, n+1);
		}
	} else {
		/* pre a6xx case, with separate register file for half and full
		 * precision:
		 */
		if (reg->flags & IR3_REG_HALF)
			n += MAX_REG;
		BITSET_SET(*regmask, n);
	}
}

static inline void regmask_init(regmask_t *regmask)
{
	memset(regmask, 0, sizeof(*regmask));
}

static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
{
	if (reg->flags & IR3_REG_RELATIV) {
		for (unsigned i = 0; i < reg->size; i++)
			__regmask_set(regmask, reg, reg->array.offset + i);
	} else {
		for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
			if (mask & 1)
				__regmask_set(regmask, reg, n);
	}
}

static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
{
	unsigned i;
	for (i = 0; i < ARRAY_SIZE(*dst); i++)
		(*dst)[i] = (*a)[i] | (*b)[i];
}

static inline bool regmask_get(regmask_t *regmask,
		struct ir3_register *reg)
{
	if (reg->flags & IR3_REG_RELATIV) {
		for (unsigned i = 0; i < reg->size; i++)
			if (__regmask_get(regmask, reg, reg->array.offset + i))
				return true;
	} else {
		for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++)
			if (mask & 1)
				if (__regmask_get(regmask, reg, n))
					return true;
	}
	return false;
}
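
/* Usage sketch (hypothetical helper, mirroring how a pass might track
 * RAW hazards): record a write into a mask, then test whether a later
 * src overlaps anything written so far:
 */
static inline bool
example_needs_sync(regmask_t *written, struct ir3_register *dst,
		struct ir3_register *src)
{
	regmask_set(written, dst);
	return regmask_get(written, src);
}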

/* ************************************************************************* */

#endif /* IR3_H_ */