freedreno/ir3: scheduler improvements
[mesa.git] / src / freedreno / ir3 / ir3.h
1 /*
2 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #ifndef IR3_H_
25 #define IR3_H_
26
27 #include <stdint.h>
28 #include <stdbool.h>
29
30 #include "compiler/shader_enums.h"
31
32 #include "util/bitscan.h"
33 #include "util/list.h"
34 #include "util/u_debug.h"
35
36 #include "instr-a3xx.h"
37
38 /* low level intermediate representation of an adreno shader program */
39
40 struct ir3_compiler;
41 struct ir3;
42 struct ir3_instruction;
43 struct ir3_block;
44
/* Statistics / resource-usage info collected while assembling a shader: */
struct ir3_info {
	uint32_t gpu_id;
	uint16_t sizedwords;      /* size of assembled shader, in dwords */
	uint16_t instrs_count;    /* expanded to account for rpt's */
	/* NOTE: max_reg, etc, does not include registers not touched
	 * by the shader (ie. vertex fetched via VFD_DECODE but not
	 * touched by shader)
	 */
	int8_t max_reg;           /* highest GPR # used by shader */
	int8_t max_half_reg;      /* highest half-precision GPR # used */
	int16_t max_const;        /* highest const-file index used */

	/* number of sync bits: */
	uint16_t ss, sy;
};
60
/* A single instruction operand (src or dst).  The flags determine which
 * members of the unions below are meaningful (immediate vs relative,
 * wrmask vs size).
 */
struct ir3_register {
	enum {
		IR3_REG_CONST  = 0x001,
		IR3_REG_IMMED  = 0x002,
		IR3_REG_HALF   = 0x004,
		/* high registers are used for some things in compute shaders,
		 * for example.  Seems to be for things that are global to all
		 * threads in a wave, so possibly these are global/shared by
		 * all the threads in the wave?
		 */
		IR3_REG_HIGH   = 0x008,
		IR3_REG_RELATIV= 0x010,
		IR3_REG_R      = 0x020,
		/* Most instructions, it seems, can do float abs/neg but not
		 * integer.  The CP pass needs to know what is intended (int or
		 * float) in order to do the right thing.  For this reason the
		 * abs/neg flags are split out into float and int variants.  In
		 * addition, .b (bitwise) operations, the negate is actually a
		 * bitwise not, so split that out into a new flag to make it
		 * more clear.
		 */
		IR3_REG_FNEG   = 0x040,
		IR3_REG_FABS   = 0x080,
		IR3_REG_SNEG   = 0x100,
		IR3_REG_SABS   = 0x200,
		IR3_REG_BNOT   = 0x400,
		IR3_REG_EVEN   = 0x800,
		IR3_REG_POS_INF= 0x1000,
		/* (ei) flag, end-input?  Set on last bary, presumably to signal
		 * that the shader needs no more input:
		 */
		IR3_REG_EI     = 0x2000,
		/* meta-flags, for intermediate stages of IR, ie.
		 * before register assignment is done:
		 */
		IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
		IR3_REG_ARRAY  = 0x8000,

	} flags;

	bool merged : 1;   /* half-regs conflict with full regs (ie >= a6xx) */

	/* normal registers:
	 * the component is in the low two bits of the reg #, so
	 * rN.x becomes: (N << 2) | x
	 */
	uint16_t num;
	union {
		/* immediate: */
		int32_t  iim_val;   /* signed view */
		uint32_t uim_val;   /* unsigned view */
		float    fim_val;   /* float view */
		/* relative: */
		struct {
			uint16_t id;        /* ir3_array id, see ir3_lookup_array() */
			int16_t  offset;    /* presumably offset relative to the address
			                     * reg value -- see create_uniform_indirect() */
		} array;
	};

	/* For IR3_REG_SSA, src registers contain ptr back to assigning
	 * instruction.
	 *
	 * For IR3_REG_ARRAY, the pointer is back to the last dependent
	 * array access (although the net effect is the same, it points
	 * back to a previous instruction that we depend on).
	 */
	struct ir3_instruction *instr;

	union {
		/* used for cat5 instructions, but also for internal/IR level
		 * tracking of what registers are read/written by an instruction.
		 * wrmask may be a bad name since it is used to represent both
		 * src and dst that touch multiple adjacent registers.
		 */
		unsigned wrmask;
		/* for relative addressing, 32bits for array size is too small,
		 * but otoh we don't need to deal with disjoint sets, so instead
		 * use a simple size field (number of scalar components).
		 */
		unsigned size;
	};
};
143
/*
 * Stupid/simple growable array implementation:
 */

/* Declares a growable array member 'name', plus its element count and
 * currently-allocated size:
 */
#define DECLARE_ARRAY(type, name) \
	unsigned name ## _count, name ## _sz; \
	type * name;

/* Append 'val' to a DECLARE_ARRAY()'d member, growing (via ralloc against
 * 'ctx') as needed.  NOTE: 'arr' is token-pasted and evaluated multiple
 * times, so it must be a plain member/identifier expression.
 */
#define array_insert(ctx, arr, val) do { \
		if (arr ## _count == arr ## _sz) { \
			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
		} \
		arr[arr ##_count++] = val; \
	} while (0)
158
/* A single instruction (native or meta).  regs[0] is the dst (but see
 * is_store(), where it is really a src), followed by the src registers.
 */
struct ir3_instruction {
	struct ir3_block *block;   /* containing basic block */
	opc_t opc;
	enum {
		/* (sy) flag is set on first instruction, and after sample
		 * instructions (probably just on RAW hazard).
		 */
		IR3_INSTR_SY    = 0x001,
		/* (ss) flag is set on first instruction, and first instruction
		 * to depend on the result of "long" instructions (RAW hazard):
		 *
		 *   rcp, rsq, log2, exp2, sin, cos, sqrt
		 *
		 * It seems to synchronize until all in-flight instructions are
		 * completed, for example:
		 *
		 *   rsq hr1.w, hr1.w
		 *   add.f hr2.z, (neg)hr2.z, hc0.y
		 *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
		 *   rsq hr2.x, hr2.x
		 *   (rpt1)nop
		 *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
		 *   nop
		 *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
		 *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
		 *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
		 *
		 * The last mul.f does not have (ss) set, presumably because the
		 * (ss) on the previous instruction does the job.
		 *
		 * The blob driver also seems to set it on WAR hazards, although
		 * not really clear if this is needed or just blob compiler being
		 * sloppy.  So far I haven't found a case where removing the (ss)
		 * causes problems for WAR hazard, but I could just be getting
		 * lucky:
		 *
		 *   rcp r1.y, r3.y
		 *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
		 *
		 */
		IR3_INSTR_SS    = 0x002,
		/* (jp) flag is set on jump targets:
		 */
		IR3_INSTR_JP    = 0x004,
		IR3_INSTR_UL    = 0x008,
		IR3_INSTR_3D    = 0x010,
		IR3_INSTR_A     = 0x020,
		IR3_INSTR_O     = 0x040,
		IR3_INSTR_P     = 0x080,
		IR3_INSTR_S     = 0x100,
		IR3_INSTR_S2EN  = 0x200,
		IR3_INSTR_G     = 0x400,
		IR3_INSTR_SAT   = 0x800,
		/* meta-flags, for intermediate stages of IR, ie.
		 * before register assignment is done:
		 */
		IR3_INSTR_MARK  = 0x1000,
		IR3_INSTR_UNUSED= 0x2000,
	} flags;
	uint8_t repeat;   /* (rpt) count */
	uint8_t nop;      /* NOTE(review): presumably # of nop's folded into this
	                   * instruction's encoding -- confirm against assembler */
#ifdef DEBUG
	unsigned regs_max;   /* NOTE(review): presumably allocated capacity of
	                      * regs[], for overflow checking */
#endif
	unsigned regs_count;          /* # of entries in regs[] (dst + srcs) */
	struct ir3_register **regs;   /* regs[0] is dst, remainder are srcs */
	union {
		/* per-category extra encoding fields: */
		struct {
			char inv;
			char comp;
			int  immed;
			struct ir3_block *target;   /* branch target block */
		} cat0;
		struct {
			type_t src_type, dst_type;  /* mov/cov conversion types */
		} cat1;
		struct {
			enum {
				IR3_COND_LT = 0,
				IR3_COND_LE = 1,
				IR3_COND_GT = 2,
				IR3_COND_GE = 3,
				IR3_COND_EQ = 4,
				IR3_COND_NE = 5,
			} condition;
		} cat2;
		struct {
			unsigned samp, tex;
			type_t type;
		} cat5;
		struct {
			type_t type;
			int src_offset;
			int dst_offset;
			int iim_val : 3;      /* for ldgb/stgb, # of components */
			unsigned d : 3;
			bool typed : 1;
		} cat6;
		struct {
			unsigned w : 1;       /* write */
			unsigned r : 1;       /* read */
			unsigned l : 1;       /* local */
			unsigned g : 1;       /* global */
		} cat7;
		/* for meta-instructions, just used to hold extra data
		 * before instruction scheduling, etc
		 */
		struct {
			int off;              /* component/offset */
		} fo;
		struct {
			struct ir3_block *block;
		} inout;
	};

	/* transient values used during various algorithms: */
	union {
		/* The instruction depth is the max dependency distance to output.
		 *
		 * You can also think of it as the "cost", if we did any sort of
		 * optimization for register footprint.  Ie. a value that is just
		 * result of moving a const to a reg would have a low cost, so to
		 * it could make sense to duplicate the instruction at various
		 * points where the result is needed to reduce register footprint.
		 */
		unsigned depth;
		/* When we get to the RA stage, we no longer need depth, but
		 * we do need instruction's position/name:
		 */
		struct {
			uint16_t ip;
			uint16_t name;
		};
	};

	/* used for per-pass extra instruction data.
	 *
	 * TODO we should remove the per-pass data like this and 'use_count'
	 * and do something similar to what RA does w/ ir3_ra_instr_data..
	 * ie. use the ir3_count_instructions pass, and then use instr->ip
	 * to index into a table of pass-private data.
	 */
	void *data;

	int sun;            /* Sethi–Ullman number, used by sched */
	int use_count;      /* currently just updated/used by cp */

	/* Used during CP and RA stages.  For fanin and shader inputs/
	 * outputs where we need a sequence of consecutive registers,
	 * keep track of each src instructions left (ie 'n-1') and right
	 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
	 * to ensure that each instruction has at most one left and at
	 * most one right neighbor.  During the copy-propagation pass,
	 * we only remove mov's when we can preserve this constraint.
	 * And during the RA stage, we use the neighbor information to
	 * allocate a block of registers in one shot.
	 *
	 * TODO: maybe just add something like:
	 *   struct ir3_instruction_ref {
	 *       struct ir3_instruction *instr;
	 *       unsigned cnt;
	 *   }
	 *
	 * Or can we get away without the refcnt stuff?  It seems like
	 * it should be overkill..  the problem is if, potentially after
	 * already eliminating some mov's, if you have a single mov that
	 * needs to be grouped with it's neighbors in two different
	 * places (ex. shader output and a fanin).
	 */
	struct {
		struct ir3_instruction *left, *right;
		uint16_t left_cnt, right_cnt;
	} cp;

	/* an instruction can reference at most one address register amongst
	 * it's src/dst registers.  Beyond that, you need to insert mov's.
	 *
	 * NOTE: do not write this directly, use ir3_instr_set_address()
	 */
	struct ir3_instruction *address;

	/* Tracking for additional dependent instructions.  Used to handle
	 * barriers, WAR hazards for arrays/SSBOs/etc.
	 */
	DECLARE_ARRAY(struct ir3_instruction *, deps);

	/*
	 * From PoV of instruction scheduling, not execution (ie. ignores global/
	 * local distinction):
	 *                           shared  image  atomic  SSBO  everything
	 *   barrier()/               -       R/W     R/W    R/W     X
	 *     groupMemoryBarrier()
	 *   memoryBarrier()          -       R/W     R/W
	 *     (but only images declared coherent?)
	 *   memoryBarrierAtomic()    -               R/W
	 *   memoryBarrierBuffer()    -                      R/W
	 *   memoryBarrierImage()     -       R/W
	 *   memoryBarrierShared()    -       R/W
	 *
	 * TODO I think for SSBO/image/shared, in cases where we can determine
	 * which variable is accessed, we don't need to care about accesses to
	 * different variables (unless declared coherent??)
	 */
	enum {
		IR3_BARRIER_EVERYTHING = 1 << 0,
		IR3_BARRIER_SHARED_R   = 1 << 1,
		IR3_BARRIER_SHARED_W   = 1 << 2,
		IR3_BARRIER_IMAGE_R    = 1 << 3,
		IR3_BARRIER_IMAGE_W    = 1 << 4,
		IR3_BARRIER_BUFFER_R   = 1 << 5,
		IR3_BARRIER_BUFFER_W   = 1 << 6,
		IR3_BARRIER_ARRAY_R    = 1 << 7,
		IR3_BARRIER_ARRAY_W    = 1 << 8,
	} barrier_class, barrier_conflict;

	/* Entry in ir3_block's instruction list: */
	struct list_head node;

#ifdef DEBUG
	uint32_t serialno;   /* unique id for debug output */
#endif
};
381
382 static inline struct ir3_instruction *
383 ir3_neighbor_first(struct ir3_instruction *instr)
384 {
385 int cnt = 0;
386 while (instr->cp.left) {
387 instr = instr->cp.left;
388 if (++cnt > 0xffff) {
389 debug_assert(0);
390 break;
391 }
392 }
393 return instr;
394 }
395
396 static inline int ir3_neighbor_count(struct ir3_instruction *instr)
397 {
398 int num = 1;
399
400 debug_assert(!instr->cp.left);
401
402 while (instr->cp.right) {
403 num++;
404 instr = instr->cp.right;
405 if (num > 0xffff) {
406 debug_assert(0);
407 break;
408 }
409 }
410
411 return num;
412 }
413
/* Top-level IR container for a single shader: */
struct ir3 {
	struct ir3_compiler *compiler;
	gl_shader_stage type;

	unsigned ninputs, noutputs;        /* sizes of the two arrays below */
	struct ir3_instruction **inputs;
	struct ir3_instruction **outputs;

	/* Track bary.f (and ldlv) instructions.. this is needed in
	 * scheduling to ensure that all varying fetches happen before
	 * any potential kill instructions.  The hw gets grumpy if all
	 * threads in a group are killed before the last bary.f gets
	 * a chance to signal end of input (ei).
	 */
	DECLARE_ARRAY(struct ir3_instruction *, baryfs);

	/* Track all indirect instructions (read and write).  To avoid
	 * deadlock scenario where an address register gets scheduled,
	 * but other dependent src instructions cannot be scheduled due
	 * to dependency on a *different* address register value, the
	 * scheduler needs to ensure that all dependencies other than
	 * the instruction other than the address register are scheduled
	 * before the one that writes the address register.  Having a
	 * convenient list of instructions that reference some address
	 * register simplifies this.
	 */
	DECLARE_ARRAY(struct ir3_instruction *, indirects);

	/* and same for instructions that consume predicate register: */
	DECLARE_ARRAY(struct ir3_instruction *, predicates);

	/* Track texture sample instructions which need texture state
	 * patched in (for astc-srgb workaround):
	 */
	DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

	/* List of blocks: */
	struct list_head block_list;

	/* List of ir3_array's: */
	struct list_head array_list;

	unsigned max_sun;   /* max Sethi–Ullman number */

#ifdef DEBUG
	unsigned block_count, instr_count;   /* serial counters for debug ids */
#endif
};
462
/* A register array, corresponding to a NIR register accessed with
 * relative (indirect) addressing:
 */
struct ir3_array {
	struct list_head node;   /* entry in ir3::array_list */
	unsigned length;         /* # of scalar components */
	unsigned id;             /* key used by ir3_lookup_array() */

	struct nir_register *r;  /* the originating NIR register */

	/* To avoid array write's from getting DCE'd, keep track of the
	 * most recent write.  Any array access depends on the most
	 * recent write.  This way, nothing depends on writes after the
	 * last read.  But all the writes that happen before that have
	 * something depending on them
	 */
	struct ir3_instruction *last_write;

	/* extra stuff used in RA pass: */
	unsigned base;      /* base vreg name */
	unsigned reg;       /* base physical reg */
	uint16_t start_ip, end_ip;   /* live range, in instruction ip's */
};
483
484 struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
485
/* A basic block in the shader's control-flow graph: */
struct ir3_block {
	struct list_head node;    /* entry in ir3::block_list */
	struct ir3 *shader;       /* back-ptr to containing shader */

	const struct nir_block *nblock;   /* corresponding NIR block */

	struct list_head instr_list;      /* list of ir3_instruction */

	/* each block has either one or two successors.. in case of
	 * two successors, 'condition' decides which one to follow.
	 * A block preceding an if/else has two successors.
	 */
	struct ir3_instruction *condition;
	struct ir3_block *successors[2];

	unsigned predecessors_count;
	struct ir3_block **predecessors;

	uint16_t start_ip, end_ip;   /* instruction ip range of the block */

	/* Track instructions which do not write a register but other-
	 * wise must not be discarded (such as kill, stg, etc)
	 */
	DECLARE_ARRAY(struct ir3_instruction *, keeps);

	/* used for per-pass extra block data.  Mainly used right
	 * now in RA step to track livein/liveout.
	 */
	void *data;

#ifdef DEBUG
	uint32_t serialno;   /* unique id for debugging, see block_id() */
#endif
};
520
/* Stable identifier for a block, for debug output: the allocation serial
 * number in debug builds, otherwise a value derived from the block's
 * address.
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
	return block->serialno;
#else
	/* Cast through uintptr_t rather than 'unsigned long': on LLP64
	 * platforms (eg. 64-bit Windows) 'unsigned long' is only 32 bits,
	 * so the pointer->long conversion is lossy/implementation-defined.
	 * uintptr_t is guaranteed to round-trip the pointer value; the
	 * final truncation to uint32_t is then explicit and well-defined.
	 */
	return (uint32_t)(uintptr_t)block;
#endif
}
530
531 struct ir3 * ir3_create(struct ir3_compiler *compiler,
532 gl_shader_stage type, unsigned nin, unsigned nout);
533 void ir3_destroy(struct ir3 *shader);
534 void * ir3_assemble(struct ir3 *shader,
535 struct ir3_info *info, uint32_t gpu_id);
536 void * ir3_alloc(struct ir3 *shader, int sz);
537
538 struct ir3_block * ir3_block_create(struct ir3 *shader);
539
540 struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
541 struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
542 opc_t opc, int nreg);
543 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
544 void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
545 const char *ir3_instr_name(struct ir3_instruction *instr);
546
547 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
548 int num, int flags);
549 struct ir3_register * ir3_reg_clone(struct ir3 *shader,
550 struct ir3_register *reg);
551
552 void ir3_instr_set_address(struct ir3_instruction *instr,
553 struct ir3_instruction *addr);
554
555 static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
556 {
557 if (instr->flags & IR3_INSTR_MARK)
558 return true; /* already visited */
559 instr->flags |= IR3_INSTR_MARK;
560 return false;
561 }
562
563 void ir3_block_clear_mark(struct ir3_block *block);
564 void ir3_clear_mark(struct ir3 *shader);
565
566 unsigned ir3_count_instructions(struct ir3 *ir);
567
568 static inline int ir3_instr_regno(struct ir3_instruction *instr,
569 struct ir3_register *reg)
570 {
571 unsigned i;
572 for (i = 0; i < instr->regs_count; i++)
573 if (reg == instr->regs[i])
574 return i;
575 return -1;
576 }
577
578
579 #define MAX_ARRAYS 16
580
/* comp:
 *   0 - x
 *   1 - y
 *   2 - z
 *   3 - w
 *
 * Packs a register # and component into the encoded form used by the hw
 * ((N << 2) | comp).
 */
static inline uint32_t regid(int num, int comp)
{
	/* cast to unsigned before shifting: left-shifting a negative signed
	 * value is undefined behavior in C.  For non-negative inputs the
	 * result is unchanged.
	 */
	return ((uint32_t)num << 2) | ((uint32_t)comp & 0x3);
}
591
592 static inline uint32_t reg_num(struct ir3_register *reg)
593 {
594 return reg->num >> 2;
595 }
596
597 static inline uint32_t reg_comp(struct ir3_register *reg)
598 {
599 return reg->num & 0x3;
600 }
601
602 static inline bool is_flow(struct ir3_instruction *instr)
603 {
604 return (opc_cat(instr->opc) == 0);
605 }
606
607 static inline bool is_kill(struct ir3_instruction *instr)
608 {
609 return instr->opc == OPC_KILL;
610 }
611
612 static inline bool is_nop(struct ir3_instruction *instr)
613 {
614 return instr->opc == OPC_NOP;
615 }
616
617 /* Is it a non-transformative (ie. not type changing) mov? This can
618 * also include absneg.s/absneg.f, which for the most part can be
619 * treated as a mov (single src argument).
620 */
621 static inline bool is_same_type_mov(struct ir3_instruction *instr)
622 {
623 struct ir3_register *dst;
624
625 switch (instr->opc) {
626 case OPC_MOV:
627 if (instr->cat1.src_type != instr->cat1.dst_type)
628 return false;
629 break;
630 case OPC_ABSNEG_F:
631 case OPC_ABSNEG_S:
632 if (instr->flags & IR3_INSTR_SAT)
633 return false;
634 break;
635 default:
636 return false;
637 }
638
639 dst = instr->regs[0];
640
641 /* mov's that write to a0.x or p0.x are special: */
642 if (dst->num == regid(REG_P0, 0))
643 return false;
644 if (dst->num == regid(REG_A0, 0))
645 return false;
646
647 if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
648 return false;
649
650 return true;
651 }
652
653 static inline bool is_alu(struct ir3_instruction *instr)
654 {
655 return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
656 }
657
658 static inline bool is_sfu(struct ir3_instruction *instr)
659 {
660 return (opc_cat(instr->opc) == 4);
661 }
662
663 static inline bool is_tex(struct ir3_instruction *instr)
664 {
665 return (opc_cat(instr->opc) == 5);
666 }
667
668 static inline bool is_mem(struct ir3_instruction *instr)
669 {
670 return (opc_cat(instr->opc) == 6);
671 }
672
673 static inline bool is_barrier(struct ir3_instruction *instr)
674 {
675 return (opc_cat(instr->opc) == 7);
676 }
677
678 static inline bool
679 is_store(struct ir3_instruction *instr)
680 {
681 /* these instructions, the "destination" register is
682 * actually a source, the address to store to.
683 */
684 switch (instr->opc) {
685 case OPC_STG:
686 case OPC_STGB:
687 case OPC_STIB:
688 case OPC_STP:
689 case OPC_STL:
690 case OPC_STLW:
691 case OPC_L2G:
692 case OPC_G2L:
693 return true;
694 default:
695 return false;
696 }
697 }
698
699 static inline bool is_load(struct ir3_instruction *instr)
700 {
701 switch (instr->opc) {
702 case OPC_LDG:
703 case OPC_LDGB:
704 case OPC_LDIB:
705 case OPC_LDL:
706 case OPC_LDP:
707 case OPC_L2G:
708 case OPC_LDLW:
709 case OPC_LDC:
710 case OPC_LDLV:
711 /* probably some others too.. */
712 return true;
713 default:
714 return false;
715 }
716 }
717
718 static inline bool is_input(struct ir3_instruction *instr)
719 {
720 /* in some cases, ldlv is used to fetch varying without
721 * interpolation.. fortunately inloc is the first src
722 * register in either case
723 */
724 switch (instr->opc) {
725 case OPC_LDLV:
726 case OPC_BARY_F:
727 return true;
728 default:
729 return false;
730 }
731 }
732
733 static inline bool is_bool(struct ir3_instruction *instr)
734 {
735 switch (instr->opc) {
736 case OPC_CMPS_F:
737 case OPC_CMPS_S:
738 case OPC_CMPS_U:
739 return true;
740 default:
741 return false;
742 }
743 }
744
745 static inline bool is_meta(struct ir3_instruction *instr)
746 {
747 /* TODO how should we count PHI (and maybe fan-in/out) which
748 * might actually contribute some instructions to the final
749 * result?
750 */
751 return (opc_cat(instr->opc) == -1);
752 }
753
754 static inline unsigned dest_regs(struct ir3_instruction *instr)
755 {
756 if ((instr->regs_count == 0) || is_store(instr))
757 return 0;
758
759 return util_last_bit(instr->regs[0]->wrmask);
760 }
761
762 static inline bool writes_addr(struct ir3_instruction *instr)
763 {
764 if (instr->regs_count > 0) {
765 struct ir3_register *dst = instr->regs[0];
766 return reg_num(dst) == REG_A0;
767 }
768 return false;
769 }
770
771 static inline bool writes_pred(struct ir3_instruction *instr)
772 {
773 if (instr->regs_count > 0) {
774 struct ir3_register *dst = instr->regs[0];
775 return reg_num(dst) == REG_P0;
776 }
777 return false;
778 }
779
780 /* returns defining instruction for reg */
781 /* TODO better name */
782 static inline struct ir3_instruction *ssa(struct ir3_register *reg)
783 {
784 if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
785 return reg->instr;
786 }
787 return NULL;
788 }
789
/* two (possibly NULL) instruction refs conflict if both are set but
 * refer to different instructions:
 */
static inline bool conflicts(struct ir3_instruction *a,
		struct ir3_instruction *b)
{
	if (!a || !b)
		return false;
	return a != b;
}
795
796 static inline bool reg_gpr(struct ir3_register *r)
797 {
798 if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
799 return false;
800 if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
801 return false;
802 return true;
803 }
804
/* map a 32-bit type to its 16-bit counterpart (identity for types that
 * are already 16-bit; asserts on anything else):
 */
static inline type_t half_type(type_t type)
{
	switch (type) {
	case TYPE_F32: return TYPE_F16;
	case TYPE_U32: return TYPE_U16;
	case TYPE_S32: return TYPE_S16;
	case TYPE_F16:
	case TYPE_U16:
	case TYPE_S16:
		/* already half precision: */
		return type;
	default:
		assert(0);
		return ~0;
	}
}
820
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool ir3_cat2_int(opc_t opc)
{
	switch (opc) {
	case OPC_ADD_U:
	case OPC_ADD_S:
	case OPC_SUB_U:
	case OPC_SUB_S:
	case OPC_CMPS_U:
	case OPC_CMPS_S:
	case OPC_MIN_U:
	case OPC_MIN_S:
	case OPC_MAX_U:
	case OPC_MAX_S:
	case OPC_CMPV_U:
	case OPC_CMPV_S:
	case OPC_MUL_U:
	case OPC_MUL_S:
	case OPC_MULL_U:
	case OPC_CLZ_S:
	case OPC_ABSNEG_S:
	case OPC_AND_B:
	case OPC_OR_B:
	case OPC_NOT_B:
	case OPC_XOR_B:
	case OPC_BFREV_B:
	case OPC_CLZ_B:
	case OPC_SHL_B:
	case OPC_SHR_B:
	case OPC_ASHR_B:
	case OPC_MGEN_B:
	case OPC_GETBIT_B:
	case OPC_CBITS_B:
	case OPC_BARY_F:
		return true;

	default:
		/* float cat2 instructions cannot embed an immediate: */
		return false;
	}
}
863
/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned ir3_cat2_absneg(opc_t opc)
{
	switch (opc) {
	/* float ops accept float abs/neg src modifiers: */
	case OPC_ADD_F:
	case OPC_MIN_F:
	case OPC_MAX_F:
	case OPC_MUL_F:
	case OPC_SIGN_F:
	case OPC_CMPS_F:
	case OPC_ABSNEG_F:
	case OPC_CMPV_F:
	case OPC_FLOOR_F:
	case OPC_CEIL_F:
	case OPC_RNDNE_F:
	case OPC_RNDAZ_F:
	case OPC_TRUNC_F:
	case OPC_BARY_F:
		return IR3_REG_FABS | IR3_REG_FNEG;

	/* integer arithmetic ops take no src modifiers: */
	case OPC_ADD_U:
	case OPC_ADD_S:
	case OPC_SUB_U:
	case OPC_SUB_S:
	case OPC_CMPS_U:
	case OPC_CMPS_S:
	case OPC_MIN_U:
	case OPC_MIN_S:
	case OPC_MAX_U:
	case OPC_MAX_S:
	case OPC_CMPV_U:
	case OPC_CMPV_S:
	case OPC_MUL_U:
	case OPC_MUL_S:
	case OPC_MULL_U:
	case OPC_CLZ_S:
		return 0;

	case OPC_ABSNEG_S:
		return IR3_REG_SABS | IR3_REG_SNEG;

	/* for bitwise ops, "negate" is really bitwise-not: */
	case OPC_AND_B:
	case OPC_OR_B:
	case OPC_NOT_B:
	case OPC_XOR_B:
	case OPC_BFREV_B:
	case OPC_CLZ_B:
	case OPC_SHL_B:
	case OPC_SHR_B:
	case OPC_ASHR_B:
	case OPC_MGEN_B:
	case OPC_GETBIT_B:
	case OPC_CBITS_B:
		return IR3_REG_BNOT;

	default:
		return 0;
	}
}
924
/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned ir3_cat3_absneg(opc_t opc)
{
	switch (opc) {
	case OPC_MAD_F16:
	case OPC_MAD_F32:
	case OPC_SEL_F16:
	case OPC_SEL_F32:
		return IR3_REG_FNEG;

	case OPC_MAD_U16:
	case OPC_MADSH_U16:
	case OPC_MAD_S16:
	case OPC_MADSH_M16:
	case OPC_MAD_U24:
	case OPC_MAD_S24:
	case OPC_SEL_S16:
	case OPC_SEL_S32:
	case OPC_SAD_S16:
	case OPC_SAD_S32:
		/* neg *may* work on 3rd src.. */
		/* fallthrough (treated as no modifiers for now) */

	case OPC_SEL_B16:
	case OPC_SEL_B32:
		/* fallthrough */

	default:
		return 0;
	}
}
954
/* bitmask with the low n bits set: */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instructions's sources (reg), also returns src #:
 * NOTE: regs[0] is the dst, so srcs start at regs[1] and there are
 * regs_count - 1 of them.
 */
#define foreach_src_n(__srcreg, __n, __instr) \
	if ((__instr)->regs_count) \
		for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
			if ((__srcreg = (__instr)->regs[__n + 1]))

/* iterator for an instructions's sources (reg): */
#define foreach_src(__srcreg, __instr) \
	foreach_src_n(__srcreg, __i, __instr)
966
967 static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
968 {
969 unsigned cnt = instr->regs_count + instr->deps_count;
970 if (instr->address)
971 cnt++;
972 return cnt;
973 }
974
975 static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
976 {
977 if (n == (instr->regs_count + instr->deps_count))
978 return instr->address;
979 if (n >= instr->regs_count)
980 return instr->deps[n - instr->regs_count];
981 return ssa(instr->regs[n]);
982 }
983
984 static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
985 {
986 if (n == (instr->regs_count + instr->deps_count))
987 return false;
988 if (n >= instr->regs_count)
989 return true;
990 return false;
991 }
992
/* # of srcs excluding dst, plus address reg if present.
 * NOTE(review): appears unused by the iterators below, which use
 * __ssa_src_cnt() instead -- confirm before relying on it.
 */
#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr) \
	for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
		if ((__srcinst = __ssa_src_n(__instr, __n)))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr) \
	foreach_ssa_src_n(__srcinst, __i, __instr)
1003
1004
1005 /* dump: */
1006 void ir3_print(struct ir3 *ir);
1007 void ir3_print_instr(struct ir3_instruction *instr);
1008
1009 /* depth calculation: */
1010 int ir3_delayslots(struct ir3_instruction *assigner,
1011 struct ir3_instruction *consumer, unsigned n);
1012 void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
1013 void ir3_depth(struct ir3 *ir);
1014
1015 /* copy-propagate: */
1016 struct ir3_shader_variant;
1017 void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
1018
1019 /* group neighbors and insert mov's to resolve conflicts: */
1020 void ir3_group(struct ir3 *ir);
1021
1022 /* Sethi–Ullman numbering: */
1023 void ir3_sun(struct ir3 *ir);
1024
1025 /* scheduling: */
1026 void ir3_sched_add_deps(struct ir3 *ir);
1027 int ir3_sched(struct ir3 *ir);
1028
1029 void ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
1030
1031 /* register assignment: */
1032 struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
1033 int ir3_ra(struct ir3 *ir3, gl_shader_stage type,
1034 bool frag_coord, bool frag_face);
1035
1036 /* legalize: */
1037 void ir3_legalize(struct ir3 *ir, bool *has_ssbo, bool *need_pixlod, int *max_bary);
1038
1039 /* ************************************************************************* */
1040 /* instruction helpers */
1041
1042 static inline struct ir3_instruction *
1043 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
1044 {
1045 struct ir3_instruction *mov;
1046 unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1047
1048 mov = ir3_instr_create(block, OPC_MOV);
1049 mov->cat1.src_type = type;
1050 mov->cat1.dst_type = type;
1051 ir3_reg_create(mov, 0, flags);
1052 ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
1053
1054 return mov;
1055 }
1056
1057 static inline struct ir3_instruction *
1058 create_immed(struct ir3_block *block, uint32_t val)
1059 {
1060 return create_immed_typed(block, val, TYPE_U32);
1061 }
1062
1063 static inline struct ir3_instruction *
1064 create_uniform(struct ir3_block *block, unsigned n)
1065 {
1066 struct ir3_instruction *mov;
1067
1068 mov = ir3_instr_create(block, OPC_MOV);
1069 /* TODO get types right? */
1070 mov->cat1.src_type = TYPE_F32;
1071 mov->cat1.dst_type = TYPE_F32;
1072 ir3_reg_create(mov, 0, 0);
1073 ir3_reg_create(mov, n, IR3_REG_CONST);
1074
1075 return mov;
1076 }
1077
1078 static inline struct ir3_instruction *
1079 create_uniform_indirect(struct ir3_block *block, int n,
1080 struct ir3_instruction *address)
1081 {
1082 struct ir3_instruction *mov;
1083
1084 mov = ir3_instr_create(block, OPC_MOV);
1085 mov->cat1.src_type = TYPE_U32;
1086 mov->cat1.dst_type = TYPE_U32;
1087 ir3_reg_create(mov, 0, 0);
1088 ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
1089
1090 ir3_instr_set_address(mov, address);
1091
1092 return mov;
1093 }
1094
1095 /* creates SSA src of correct type (ie. half vs full precision) */
1096 static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
1097 struct ir3_instruction *src, unsigned flags)
1098 {
1099 struct ir3_register *reg;
1100 if (src->regs[0]->flags & IR3_REG_HALF)
1101 flags |= IR3_REG_HALF;
1102 reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags);
1103 reg->instr = src;
1104 reg->wrmask = src->regs[0]->wrmask;
1105 return reg;
1106 }
1107
/* create a same-type mov of 'src' (array srcs handled; relative srcs are
 * not allowed -- see the debug_assert below):
 */
static inline struct ir3_instruction *
ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
{
	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
	ir3_reg_create(instr, 0, 0); /* dst */
	if (src->regs[0]->flags & IR3_REG_ARRAY) {
		/* array srcs additionally carry the array id/offset: */
		struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
		src_reg->array = src->regs[0]->array;
	} else {
		__ssa_src(instr, src, src->regs[0]->flags & IR3_REG_HIGH);
	}
	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
	instr->cat1.src_type = type;
	instr->cat1.dst_type = type;
	return instr;
}
1124
/* create a type-converting mov from src_type to dst_type: */
static inline struct ir3_instruction *
ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
		type_t src_type, type_t dst_type)
{
	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
	unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
	unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;

	/* src's precision must already match src_type: */
	debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags);

	ir3_reg_create(instr, 0, dst_flags); /* dst */
	__ssa_src(instr, src, 0);
	instr->cat1.src_type = src_type;
	instr->cat1.dst_type = dst_type;
	/* array srcs are not supported here (use ir3_MOV): */
	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
	return instr;
}
1142
1143 static inline struct ir3_instruction *
1144 ir3_NOP(struct ir3_block *block)
1145 {
1146 return ir3_instr_create(block, OPC_NOP);
1147 }
1148
#define IR3_INSTR_0 0

/* The __INSTRn macros stamp out static-inline builder helpers,
 * ir3_<NAME>(), for instructions taking n SSA sources.  Each helper
 * creates the instruction, adds the dst register plus the given srcs
 * (with per-src flags), and ORs 'flag' into instr->flags.  The INSTRnF
 * variants bake an IR3_INSTR_* flag into the helper name (e.g.
 * INSTR1F(3D, DSX) -> ir3_DSX_3D()).
 */
#define __INSTR0(flag, name, opc)                                        \
static inline struct ir3_instruction *                                   \
ir3_##name(struct ir3_block *block)                                      \
{                                                                        \
	struct ir3_instruction *instr =                                  \
		ir3_instr_create(block, opc);                            \
	instr->flags |= flag;                                            \
	return instr;                                                    \
}
#define INSTR0F(f, name)    __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name)        __INSTR0(0, name, OPC_##name)

#define __INSTR1(flag, name, opc)                                        \
static inline struct ir3_instruction *                                   \
ir3_##name(struct ir3_block *block,                                      \
		struct ir3_instruction *a, unsigned aflags)              \
{                                                                        \
	struct ir3_instruction *instr =                                  \
		ir3_instr_create(block, opc);                            \
	ir3_reg_create(instr, 0, 0); /* dst */                           \
	__ssa_src(instr, a, aflags);                                     \
	instr->flags |= flag;                                            \
	return instr;                                                    \
}
#define INSTR1F(f, name)    __INSTR1(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR1(name)        __INSTR1(0, name, OPC_##name)

#define __INSTR2(flag, name, opc)                                        \
static inline struct ir3_instruction *                                   \
ir3_##name(struct ir3_block *block,                                      \
		struct ir3_instruction *a, unsigned aflags,              \
		struct ir3_instruction *b, unsigned bflags)              \
{                                                                        \
	struct ir3_instruction *instr =                                  \
		ir3_instr_create(block, opc);                            \
	ir3_reg_create(instr, 0, 0); /* dst */                           \
	__ssa_src(instr, a, aflags);                                     \
	__ssa_src(instr, b, bflags);                                     \
	instr->flags |= flag;                                            \
	return instr;                                                    \
}
#define INSTR2F(f, name)    __INSTR2(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR2(name)        __INSTR2(0, name, OPC_##name)

/* 3 and 4 src variants use ir3_instr_create2() to size the regs array
 * for dst + n srcs (4 and 5 registers respectively):
 */
#define __INSTR3(flag, name, opc)                                        \
static inline struct ir3_instruction *                                   \
ir3_##name(struct ir3_block *block,                                      \
		struct ir3_instruction *a, unsigned aflags,              \
		struct ir3_instruction *b, unsigned bflags,              \
		struct ir3_instruction *c, unsigned cflags)              \
{                                                                        \
	struct ir3_instruction *instr =                                  \
		ir3_instr_create2(block, opc, 4);                        \
	ir3_reg_create(instr, 0, 0); /* dst */                           \
	__ssa_src(instr, a, aflags);                                     \
	__ssa_src(instr, b, bflags);                                     \
	__ssa_src(instr, c, cflags);                                     \
	instr->flags |= flag;                                            \
	return instr;                                                    \
}
#define INSTR3F(f, name)    __INSTR3(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR3(name)        __INSTR3(0, name, OPC_##name)

#define __INSTR4(flag, name, opc)                                        \
static inline struct ir3_instruction *                                   \
ir3_##name(struct ir3_block *block,                                      \
		struct ir3_instruction *a, unsigned aflags,              \
		struct ir3_instruction *b, unsigned bflags,              \
		struct ir3_instruction *c, unsigned cflags,              \
		struct ir3_instruction *d, unsigned dflags)              \
{                                                                        \
	struct ir3_instruction *instr =                                  \
		ir3_instr_create2(block, opc, 5);                        \
	ir3_reg_create(instr, 0, 0); /* dst */                           \
	__ssa_src(instr, a, aflags);                                     \
	__ssa_src(instr, b, bflags);                                     \
	__ssa_src(instr, c, cflags);                                     \
	__ssa_src(instr, d, dflags);                                     \
	instr->flags |= flag;                                            \
	return instr;                                                    \
}
#define INSTR4F(f, name)    __INSTR4(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR4(name)        __INSTR4(0, name, OPC_##name)
1234
/* Each INSTRn()/INSTRnF() invocation below expands to a static-inline
 * builder helper ir3_<NAME>() (see the __INSTRn macros), grouped by
 * instruction category.
 */

/* cat0 instructions: */
INSTR0(BR)
INSTR0(JUMP)
INSTR1(KILL)
INSTR0(END)

/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U)
INSTR2(MUL_S)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)

/* cat3 instructions: */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions: */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(LOG2)
INSTR1(EXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions: */
INSTR1(DSX)
INSTR1(DSY)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
1322
1323 static inline struct ir3_instruction *
1324 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
1325 unsigned wrmask, unsigned flags, struct ir3_instruction *samp_tex,
1326 struct ir3_instruction *src0, struct ir3_instruction *src1)
1327 {
1328 struct ir3_instruction *sam;
1329 struct ir3_register *reg;
1330
1331 sam = ir3_instr_create(block, opc);
1332 sam->flags |= flags | IR3_INSTR_S2EN;
1333 ir3_reg_create(sam, 0, 0)->wrmask = wrmask;
1334 __ssa_src(sam, samp_tex, IR3_REG_HALF);
1335 if (src0) {
1336 reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
1337 reg->wrmask = (1 << (src0->regs_count - 1)) - 1;
1338 reg->instr = src0;
1339 }
1340 if (src1) {
1341 reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
1342 reg->instr = src1;
1343 reg->wrmask = (1 << (src1->regs_count - 1)) - 1;
1344 }
1345 sam->cat5.type = type;
1346
1347 return sam;
1348 }
1349
1350 /* cat6 instructions: */
1351 INSTR2(LDLV)
1352 INSTR2(LDG)
1353 INSTR2(LDL)
1354 INSTR3(STG)
1355 INSTR3(STL)
1356 INSTR1(RESINFO)
1357 INSTR1(RESFMT)
1358 INSTR2(ATOMIC_ADD)
1359 INSTR2(ATOMIC_SUB)
1360 INSTR2(ATOMIC_XCHG)
1361 INSTR2(ATOMIC_INC)
1362 INSTR2(ATOMIC_DEC)
1363 INSTR2(ATOMIC_CMPXCHG)
1364 INSTR2(ATOMIC_MIN)
1365 INSTR2(ATOMIC_MAX)
1366 INSTR2(ATOMIC_AND)
1367 INSTR2(ATOMIC_OR)
1368 INSTR2(ATOMIC_XOR)
1369 #if GPU >= 600
1370 INSTR3(STIB);
1371 INSTR2(LDIB);
1372 INSTR3F(G, ATOMIC_ADD)
1373 INSTR3F(G, ATOMIC_SUB)
1374 INSTR3F(G, ATOMIC_XCHG)
1375 INSTR3F(G, ATOMIC_INC)
1376 INSTR3F(G, ATOMIC_DEC)
1377 INSTR3F(G, ATOMIC_CMPXCHG)
1378 INSTR3F(G, ATOMIC_MIN)
1379 INSTR3F(G, ATOMIC_MAX)
1380 INSTR3F(G, ATOMIC_AND)
1381 INSTR3F(G, ATOMIC_OR)
1382 INSTR3F(G, ATOMIC_XOR)
1383 #elif GPU >= 400
1384 INSTR3(LDGB)
1385 INSTR4(STGB)
1386 INSTR4(STIB)
1387 INSTR4F(G, ATOMIC_ADD)
1388 INSTR4F(G, ATOMIC_SUB)
1389 INSTR4F(G, ATOMIC_XCHG)
1390 INSTR4F(G, ATOMIC_INC)
1391 INSTR4F(G, ATOMIC_DEC)
1392 INSTR4F(G, ATOMIC_CMPXCHG)
1393 INSTR4F(G, ATOMIC_MIN)
1394 INSTR4F(G, ATOMIC_MAX)
1395 INSTR4F(G, ATOMIC_AND)
1396 INSTR4F(G, ATOMIC_OR)
1397 INSTR4F(G, ATOMIC_XOR)
1398 #endif
1399
1400 /* cat7 instructions: */
1401 INSTR0(BAR)
1402 INSTR0(FENCE)
1403
/* ************************************************************************* */
/* split this out or find some helper to use.. like main/bitset.h.. */

#include <string.h>

#define MAX_REG 256

/* One bit per register.  Sized 2*MAX_REG bits so that, when the register
 * files are not merged, half regs can occupy a second bank of MAX_REG
 * bits (see regmask_idx()).
 */
typedef uint8_t regmask_t[2 * MAX_REG / 8];
1412
1413 static inline unsigned regmask_idx(struct ir3_register *reg)
1414 {
1415 unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
1416 debug_assert(num < MAX_REG);
1417 if (reg->flags & IR3_REG_HALF) {
1418 if (reg->merged) {
1419 num /= 2;
1420 } else {
1421 num += MAX_REG;
1422 }
1423 }
1424 return num;
1425 }
1426
1427 static inline void regmask_init(regmask_t *regmask)
1428 {
1429 memset(regmask, 0, sizeof(*regmask));
1430 }
1431
1432 static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
1433 {
1434 unsigned idx = regmask_idx(reg);
1435 if (reg->flags & IR3_REG_RELATIV) {
1436 unsigned i;
1437 for (i = 0; i < reg->size; i++, idx++)
1438 (*regmask)[idx / 8] |= 1 << (idx % 8);
1439 } else {
1440 unsigned mask;
1441 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
1442 if (mask & 1)
1443 (*regmask)[idx / 8] |= 1 << (idx % 8);
1444 }
1445 }
1446
1447 static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
1448 {
1449 unsigned i;
1450 for (i = 0; i < ARRAY_SIZE(*dst); i++)
1451 (*dst)[i] = (*a)[i] | (*b)[i];
1452 }
1453
1454 /* set bits in a if not set in b, conceptually:
1455 * a |= (reg & ~b)
1456 */
1457 static inline void regmask_set_if_not(regmask_t *a,
1458 struct ir3_register *reg, regmask_t *b)
1459 {
1460 unsigned idx = regmask_idx(reg);
1461 if (reg->flags & IR3_REG_RELATIV) {
1462 unsigned i;
1463 for (i = 0; i < reg->size; i++, idx++)
1464 if (!((*b)[idx / 8] & (1 << (idx % 8))))
1465 (*a)[idx / 8] |= 1 << (idx % 8);
1466 } else {
1467 unsigned mask;
1468 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
1469 if (mask & 1)
1470 if (!((*b)[idx / 8] & (1 << (idx % 8))))
1471 (*a)[idx / 8] |= 1 << (idx % 8);
1472 }
1473 }
1474
1475 static inline bool regmask_get(regmask_t *regmask,
1476 struct ir3_register *reg)
1477 {
1478 unsigned idx = regmask_idx(reg);
1479 if (reg->flags & IR3_REG_RELATIV) {
1480 unsigned i;
1481 for (i = 0; i < reg->size; i++, idx++)
1482 if ((*regmask)[idx / 8] & (1 << (idx % 8)))
1483 return true;
1484 } else {
1485 unsigned mask;
1486 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
1487 if (mask & 1)
1488 if ((*regmask)[idx / 8] & (1 << (idx % 8)))
1489 return true;
1490 }
1491 return false;
1492 }
1493
1494 /* ************************************************************************* */
1495
1496 #endif /* IR3_H_ */