4bd7601b8dd4f79b9d1e24e8970b72ebb3eb12ac
[mesa.git] / src / freedreno / ir3 / ir3.h
1 /*
2 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #ifndef IR3_H_
25 #define IR3_H_
26
27 #include <stdint.h>
28 #include <stdbool.h>
29
30 #include "compiler/shader_enums.h"
31
32 #include "util/bitscan.h"
33 #include "util/list.h"
34 #include "util/u_debug.h"
35
36 #include "instr-a3xx.h"
37
38 /* low level intermediate representation of an adreno shader program */
39
40 struct ir3_compiler;
41 struct ir3;
42 struct ir3_instruction;
43 struct ir3_block;
44
/* Summary statistics filled in by ir3_assemble(): */
struct ir3_info {
	uint32_t gpu_id;
	uint16_t sizedwords;          /* size of assembled shader, in dwords */
	uint16_t instrs_count;        /* expanded to account for rpt's */
	/* NOTE: max_reg, etc, does not include registers not touched
	 * by the shader (ie. vertex fetched via VFD_DECODE but not
	 * touched by shader)
	 */
	int8_t max_reg;               /* highest GPR # used by shader */
	int8_t max_half_reg;          /* highest half-precision GPR # used */
	int16_t max_const;            /* highest const-file slot used */

	/* number of sync bits: */
	uint16_t ss, sy;
};
60
/* A single operand (dst or src) of an instruction: */
struct ir3_register {
	enum {
		IR3_REG_CONST = 0x001,    /* src from const file (cN.c) */
		IR3_REG_IMMED = 0x002,    /* inline immediate value */
		IR3_REG_HALF = 0x004,     /* half-precision register */
		/* high registers are used for some things in compute shaders,
		 * for example. Seems to be for things that are global to all
		 * threads in a wave, so possibly these are global/shared by
		 * all the threads in the wave?
		 */
		IR3_REG_HIGH = 0x008,
		IR3_REG_RELATIV= 0x010,   /* indirect (address-register relative) */
		IR3_REG_R = 0x020,        /* (r) flag -- NOTE(review): presumably
		                           * repeat-related; confirm */
		/* Most instructions, it seems, can do float abs/neg but not
		 * integer. The CP pass needs to know what is intended (int or
		 * float) in order to do the right thing. For this reason the
		 * abs/neg flags are split out into float and int variants. In
		 * addition, .b (bitwise) operations, the negate is actually a
		 * bitwise not, so split that out into a new flag to make it
		 * more clear.
		 */
		IR3_REG_FNEG = 0x040,
		IR3_REG_FABS = 0x080,
		IR3_REG_SNEG = 0x100,
		IR3_REG_SABS = 0x200,
		IR3_REG_BNOT = 0x400,
		IR3_REG_EVEN = 0x800,
		IR3_REG_POS_INF= 0x1000,
		/* (ei) flag, end-input? Set on last bary, presumably to signal
		 * that the shader needs no more input:
		 */
		IR3_REG_EI = 0x2000,
		/* meta-flags, for intermediate stages of IR, ie.
		 * before register assignment is done:
		 */
		IR3_REG_SSA = 0x4000,     /* 'instr' is ptr to assigning instr */
		IR3_REG_ARRAY = 0x8000,

	} flags;

	/* normal registers:
	 * the component is in the low two bits of the reg #, so
	 * rN.x becomes: (N << 2) | x
	 */
	int num;
	union {
		/* immediate: */
		int32_t iim_val;
		uint32_t uim_val;
		float fim_val;
		/* relative: */
		struct {
			uint16_t id;      /* matches ir3_array::id */
			int16_t offset;
		} array;
	};

	/* For IR3_REG_SSA, src registers contain ptr back to assigning
	 * instruction.
	 *
	 * For IR3_REG_ARRAY, the pointer is back to the last dependent
	 * array access (although the net effect is the same, it points
	 * back to a previous instruction that we depend on).
	 */
	struct ir3_instruction *instr;

	union {
		/* used for cat5 instructions, but also for internal/IR level
		 * tracking of what registers are read/written by an instruction.
		 * wrmask may be a bad name since it is used to represent both
		 * src and dst that touch multiple adjacent registers.
		 */
		unsigned wrmask;
		/* for relative addressing, 32bits for array size is too small,
		 * but otoh we don't need to deal with disjoint sets, so instead
		 * use a simple size field (number of scalar components).
		 */
		unsigned size;
	};
};
141
/*
 * Stupid/simple growable array implementation:
 */

/* Declares the backing pointer plus current count and allocated size
 * for a growable array member/field called 'name':
 */
#define DECLARE_ARRAY(type, name) \
	unsigned name ## _count, name ## _sz; \
	type * name;

/* Append 'val' to array 'arr', doubling the allocation (owned by the
 * ralloc context 'ctx') when full:
 */
#define array_insert(ctx, arr, val) do { \
		if (arr ## _count == arr ## _sz) { \
			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
		} \
		arr[arr ##_count++] = val; \
	} while (0)
156
struct ir3_instruction {
	struct ir3_block *block;      /* the block containing this instruction */
	opc_t opc;
	enum {
		/* (sy) flag is set on first instruction, and after sample
		 * instructions (probably just on RAW hazard).
		 */
		IR3_INSTR_SY = 0x001,
		/* (ss) flag is set on first instruction, and first instruction
		 * to depend on the result of "long" instructions (RAW hazard):
		 *
		 * rcp, rsq, log2, exp2, sin, cos, sqrt
		 *
		 * It seems to synchronize until all in-flight instructions are
		 * completed, for example:
		 *
		 *   rsq hr1.w, hr1.w
		 *   add.f hr2.z, (neg)hr2.z, hc0.y
		 *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
		 *   rsq hr2.x, hr2.x
		 *   (rpt1)nop
		 *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
		 *   nop
		 *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
		 *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
		 *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
		 *
		 * The last mul.f does not have (ss) set, presumably because the
		 * (ss) on the previous instruction does the job.
		 *
		 * The blob driver also seems to set it on WAR hazards, although
		 * not really clear if this is needed or just blob compiler being
		 * sloppy. So far I haven't found a case where removing the (ss)
		 * causes problems for WAR hazard, but I could just be getting
		 * lucky:
		 *
		 *   rcp r1.y, r3.y
		 *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
		 *
		 */
		IR3_INSTR_SS = 0x002,
		/* (jp) flag is set on jump targets:
		 */
		IR3_INSTR_JP = 0x004,
		IR3_INSTR_UL = 0x008,
		IR3_INSTR_3D = 0x010,
		IR3_INSTR_A = 0x020,
		IR3_INSTR_O = 0x040,
		IR3_INSTR_P = 0x080,
		IR3_INSTR_S = 0x100,
		IR3_INSTR_S2EN = 0x200,
		IR3_INSTR_G = 0x400,
		IR3_INSTR_SAT = 0x800,
		/* meta-flags, for intermediate stages of IR, ie.
		 * before register assignment is done:
		 */
		IR3_INSTR_MARK = 0x1000,
		IR3_INSTR_UNUSED= 0x2000,
	} flags;
	uint8_t repeat;               /* (rptN) count */
	uint8_t nop;                  /* NOTE(review): presumably # of nop's
	                               * folded into this instr -- confirm */
#ifdef DEBUG
	unsigned regs_max;            /* allocated size of regs[] */
#endif
	unsigned regs_count;          /* # of entries in regs[]; regs[0] is dst */
	struct ir3_register **regs;
	/* opcode-category specific state: */
	union {
		struct {
			char inv;
			char comp;
			int immed;
			struct ir3_block *target;
		} cat0;               /* flow control */
		struct {
			type_t src_type, dst_type;
		} cat1;               /* mov/cov */
		struct {
			enum {
				IR3_COND_LT = 0,
				IR3_COND_LE = 1,
				IR3_COND_GT = 2,
				IR3_COND_GE = 3,
				IR3_COND_EQ = 4,
				IR3_COND_NE = 5,
			} condition;
		} cat2;
		struct {
			unsigned samp, tex;
			type_t type;
		} cat5;               /* texture sample */
		struct {
			type_t type;
			int src_offset;
			int dst_offset;
			int iim_val : 3;      /* for ldgb/stgb, # of components */
			unsigned d : 3;
			bool typed : 1;
		} cat6;               /* memory access */
		struct {
			unsigned w : 1;       /* write */
			unsigned r : 1;       /* read */
			unsigned l : 1;       /* local */
			unsigned g : 1;       /* global */
		} cat7;               /* barrier */
		/* for meta-instructions, just used to hold extra data
		 * before instruction scheduling, etc
		 */
		struct {
			int off;              /* component/offset */
		} fo;
		struct {
			struct ir3_block *block;
		} inout;
	};

	/* transient values used during various algorithms: */
	union {
		/* The instruction depth is the max dependency distance to output.
		 *
		 * You can also think of it as the "cost", if we did any sort of
		 * optimization for register footprint. Ie. a value that is just
		 * result of moving a const to a reg would have a low cost, so to
		 * it could make sense to duplicate the instruction at various
		 * points where the result is needed to reduce register footprint.
		 */
		unsigned depth;
		/* When we get to the RA stage, we no longer need depth, but
		 * we do need instruction's position/name:
		 */
		struct {
			uint16_t ip;
			uint16_t name;
		};
	};

	/* used for per-pass extra instruction data.
	 */
	void *data;

	int sun;              /* Sethi–Ullman number, used by sched */
	int use_count;        /* currently just updated/used by cp */

	/* Used during CP and RA stages. For fanin and shader inputs/
	 * outputs where we need a sequence of consecutive registers,
	 * keep track of each src instructions left (ie 'n-1') and right
	 * (ie 'n+1') neighbor. The front-end must insert enough mov's
	 * to ensure that each instruction has at most one left and at
	 * most one right neighbor. During the copy-propagation pass,
	 * we only remove mov's when we can preserve this constraint.
	 * And during the RA stage, we use the neighbor information to
	 * allocate a block of registers in one shot.
	 *
	 * TODO: maybe just add something like:
	 *   struct ir3_instruction_ref {
	 *     struct ir3_instruction *instr;
	 *     unsigned cnt;
	 *   }
	 *
	 * Or can we get away without the refcnt stuff?  It seems like
	 * it should be overkill..  the problem is if, potentially after
	 * already eliminating some mov's, if you have a single mov that
	 * needs to be grouped with it's neighbors in two different
	 * places (ex. shader output and a fanin).
	 */
	struct {
		struct ir3_instruction *left, *right;
		uint16_t left_cnt, right_cnt;
	} cp;

	/* an instruction can reference at most one address register amongst
	 * it's src/dst registers.  Beyond that, you need to insert mov's.
	 *
	 * NOTE: do not write this directly, use ir3_instr_set_address()
	 */
	struct ir3_instruction *address;

	/* Tracking for additional dependent instructions.  Used to handle
	 * barriers, WAR hazards for arrays/SSBOs/etc.
	 */
	DECLARE_ARRAY(struct ir3_instruction *, deps);

	/*
	 * From PoV of instruction scheduling, not execution (ie. ignores global/
	 * local distinction):
	 *                            shared  image  atomic  SSBO  everything
	 *   barrier()/            -   R/W     R/W    R/W     R/W       X
	 *     groupMemoryBarrier()
	 *   memoryBarrier()       -           R/W    R/W
	 *     (but only images declared coherent?)
	 *   memoryBarrierAtomic() -                  R/W
	 *   memoryBarrierBuffer() -                          R/W
	 *   memoryBarrierImage()  -           R/W
	 *   memoryBarrierShared() -   R/W
	 *
	 * TODO I think for SSBO/image/shared, in cases where we can determine
	 * which variable is accessed, we don't need to care about accesses to
	 * different variables (unless declared coherent??)
	 */
	enum {
		IR3_BARRIER_EVERYTHING = 1 << 0,
		IR3_BARRIER_SHARED_R   = 1 << 1,
		IR3_BARRIER_SHARED_W   = 1 << 2,
		IR3_BARRIER_IMAGE_R    = 1 << 3,
		IR3_BARRIER_IMAGE_W    = 1 << 4,
		IR3_BARRIER_BUFFER_R   = 1 << 5,
		IR3_BARRIER_BUFFER_W   = 1 << 6,
		IR3_BARRIER_ARRAY_R    = 1 << 7,
		IR3_BARRIER_ARRAY_W    = 1 << 8,
	} barrier_class, barrier_conflict;

	/* Entry in ir3_block's instruction list: */
	struct list_head node;

#ifdef DEBUG
	uint32_t serialno;    /* debug-only allocation serial number */
#endif
};
374
375 static inline struct ir3_instruction *
376 ir3_neighbor_first(struct ir3_instruction *instr)
377 {
378 int cnt = 0;
379 while (instr->cp.left) {
380 instr = instr->cp.left;
381 if (++cnt > 0xffff) {
382 debug_assert(0);
383 break;
384 }
385 }
386 return instr;
387 }
388
389 static inline int ir3_neighbor_count(struct ir3_instruction *instr)
390 {
391 int num = 1;
392
393 debug_assert(!instr->cp.left);
394
395 while (instr->cp.right) {
396 num++;
397 instr = instr->cp.right;
398 if (num > 0xffff) {
399 debug_assert(0);
400 break;
401 }
402 }
403
404 return num;
405 }
406
/* Toplevel container for the IR of one shader program: */
struct ir3 {
	struct ir3_compiler *compiler;

	/* shader inputs/outputs (each entry is the defining instruction): */
	unsigned ninputs, noutputs;
	struct ir3_instruction **inputs;
	struct ir3_instruction **outputs;

	/* Track bary.f (and ldlv) instructions.. this is needed in
	 * scheduling to ensure that all varying fetches happen before
	 * any potential kill instructions. The hw gets grumpy if all
	 * threads in a group are killed before the last bary.f gets
	 * a chance to signal end of input (ei).
	 */
	DECLARE_ARRAY(struct ir3_instruction *, baryfs);

	/* Track all indirect instructions (read and write). To avoid
	 * deadlock scenario where an address register gets scheduled,
	 * but other dependent src instructions cannot be scheduled due
	 * to dependency on a *different* address register value, the
	 * scheduler needs to ensure that all dependencies other than
	 * the instruction other than the address register are scheduled
	 * before the one that writes the address register. Having a
	 * convenient list of instructions that reference some address
	 * register simplifies this.
	 */
	DECLARE_ARRAY(struct ir3_instruction *, indirects);

	/* and same for instructions that consume predicate register: */
	DECLARE_ARRAY(struct ir3_instruction *, predicates);

	/* Track texture sample instructions which need texture state
	 * patched in (for astc-srgb workaround):
	 */
	DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

	/* List of blocks: */
	struct list_head block_list;

	/* List of ir3_array's: */
	struct list_head array_list;

	unsigned max_sun;     /* max Sethi–Ullman number */

#ifdef DEBUG
	unsigned block_count, instr_count;
#endif
};
454
/* A register array, accessed via IR3_REG_ARRAY / relative addressing: */
struct ir3_array {
	struct list_head node;        /* entry in ir3::array_list */
	unsigned length;              /* # of scalar components */
	unsigned id;                  /* matches ir3_register::array.id */

	struct nir_register *r;       /* originating NIR register */

	/* To avoid array write's from getting DCE'd, keep track of the
	 * most recent write. Any array access depends on the most
	 * recent write. This way, nothing depends on writes after the
	 * last read. But all the writes that happen before that have
	 * something depending on them
	 */
	struct ir3_instruction *last_write;

	/* extra stuff used in RA pass: */
	unsigned base;        /* base vreg name */
	unsigned reg;         /* base physical reg */
	uint16_t start_ip, end_ip;
};
475
476 struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
477
/* A basic block in the control flow graph: */
struct ir3_block {
	struct list_head node;        /* entry in ir3::block_list */
	struct ir3 *shader;           /* containing shader */

	const struct nir_block *nblock;       /* originating NIR block */

	struct list_head instr_list;  /* list of ir3_instruction */

	/* each block has either one or two successors.. in case of
	 * two successors, 'condition' decides which one to follow.
	 * A block preceding an if/else has two successors.
	 */
	struct ir3_instruction *condition;
	struct ir3_block *successors[2];

	unsigned predecessors_count;
	struct ir3_block **predecessors;

	/* IP range covered by this block: */
	uint16_t start_ip, end_ip;

	/* Track instructions which do not write a register but other-
	 * wise must not be discarded (such as kill, stg, etc)
	 */
	DECLARE_ARRAY(struct ir3_instruction *, keeps);

	/* used for per-pass extra block data. Mainly used right
	 * now in RA step to track livein/liveout.
	 */
	void *data;

#ifdef DEBUG
	uint32_t serialno;    /* debug-only allocation serial number */
#endif
};
512
/* Identifier for a block, for debug output: the allocation serial
 * number on debug builds, otherwise a value derived from the pointer.
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#ifdef DEBUG
	return block->serialno;
#else
	/* Go via uintptr_t: casting a pointer straight to 'unsigned long'
	 * is lossy on LLP64 platforms (eg. 64-bit Windows) where long is
	 * only 32 bits, and draws pointer-to-int-cast warnings.
	 */
	return (uint32_t)(uintptr_t)block;
#endif
}
522
523 struct ir3 * ir3_create(struct ir3_compiler *compiler,
524 unsigned nin, unsigned nout);
525 void ir3_destroy(struct ir3 *shader);
526 void * ir3_assemble(struct ir3 *shader,
527 struct ir3_info *info, uint32_t gpu_id);
528 void * ir3_alloc(struct ir3 *shader, int sz);
529
530 struct ir3_block * ir3_block_create(struct ir3 *shader);
531
532 struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
533 struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
534 opc_t opc, int nreg);
535 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
536 void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
537 const char *ir3_instr_name(struct ir3_instruction *instr);
538
539 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
540 int num, int flags);
541 struct ir3_register * ir3_reg_clone(struct ir3 *shader,
542 struct ir3_register *reg);
543
544 void ir3_instr_set_address(struct ir3_instruction *instr,
545 struct ir3_instruction *addr);
546
547 static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
548 {
549 if (instr->flags & IR3_INSTR_MARK)
550 return true; /* already visited */
551 instr->flags |= IR3_INSTR_MARK;
552 return false;
553 }
554
555 void ir3_block_clear_mark(struct ir3_block *block);
556 void ir3_clear_mark(struct ir3 *shader);
557
558 unsigned ir3_count_instructions(struct ir3 *ir);
559
560 static inline int ir3_instr_regno(struct ir3_instruction *instr,
561 struct ir3_register *reg)
562 {
563 unsigned i;
564 for (i = 0; i < instr->regs_count; i++)
565 if (reg == instr->regs[i])
566 return i;
567 return -1;
568 }
569
570
571 #define MAX_ARRAYS 16
572
/* comp:
 *   0 - x
 *   1 - y
 *   2 - z
 *   3 - w
 */
static inline uint32_t regid(int num, int comp)
{
	/* register # in upper bits, component in the low two: */
	int component = comp & 0x3;
	return (num << 2) | component;
}
583
584 static inline uint32_t reg_num(struct ir3_register *reg)
585 {
586 return reg->num >> 2;
587 }
588
589 static inline uint32_t reg_comp(struct ir3_register *reg)
590 {
591 return reg->num & 0x3;
592 }
593
594 static inline bool is_flow(struct ir3_instruction *instr)
595 {
596 return (opc_cat(instr->opc) == 0);
597 }
598
599 static inline bool is_kill(struct ir3_instruction *instr)
600 {
601 return instr->opc == OPC_KILL;
602 }
603
604 static inline bool is_nop(struct ir3_instruction *instr)
605 {
606 return instr->opc == OPC_NOP;
607 }
608
609 /* Is it a non-transformative (ie. not type changing) mov? This can
610 * also include absneg.s/absneg.f, which for the most part can be
611 * treated as a mov (single src argument).
612 */
613 static inline bool is_same_type_mov(struct ir3_instruction *instr)
614 {
615 struct ir3_register *dst;
616
617 switch (instr->opc) {
618 case OPC_MOV:
619 if (instr->cat1.src_type != instr->cat1.dst_type)
620 return false;
621 break;
622 case OPC_ABSNEG_F:
623 case OPC_ABSNEG_S:
624 if (instr->flags & IR3_INSTR_SAT)
625 return false;
626 break;
627 default:
628 return false;
629 }
630
631 dst = instr->regs[0];
632
633 /* mov's that write to a0.x or p0.x are special: */
634 if (dst->num == regid(REG_P0, 0))
635 return false;
636 if (dst->num == regid(REG_A0, 0))
637 return false;
638
639 if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
640 return false;
641
642 return true;
643 }
644
645 static inline bool is_alu(struct ir3_instruction *instr)
646 {
647 return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
648 }
649
650 static inline bool is_sfu(struct ir3_instruction *instr)
651 {
652 return (opc_cat(instr->opc) == 4);
653 }
654
655 static inline bool is_tex(struct ir3_instruction *instr)
656 {
657 return (opc_cat(instr->opc) == 5);
658 }
659
660 static inline bool is_mem(struct ir3_instruction *instr)
661 {
662 return (opc_cat(instr->opc) == 6);
663 }
664
665 static inline bool is_barrier(struct ir3_instruction *instr)
666 {
667 return (opc_cat(instr->opc) == 7);
668 }
669
670 static inline bool
671 is_store(struct ir3_instruction *instr)
672 {
673 /* these instructions, the "destination" register is
674 * actually a source, the address to store to.
675 */
676 switch (instr->opc) {
677 case OPC_STG:
678 case OPC_STGB:
679 case OPC_STIB:
680 case OPC_STP:
681 case OPC_STL:
682 case OPC_STLW:
683 case OPC_L2G:
684 case OPC_G2L:
685 return true;
686 default:
687 return false;
688 }
689 }
690
691 static inline bool is_load(struct ir3_instruction *instr)
692 {
693 switch (instr->opc) {
694 case OPC_LDG:
695 case OPC_LDGB:
696 case OPC_LDIB:
697 case OPC_LDL:
698 case OPC_LDP:
699 case OPC_L2G:
700 case OPC_LDLW:
701 case OPC_LDC:
702 case OPC_LDLV:
703 /* probably some others too.. */
704 return true;
705 default:
706 return false;
707 }
708 }
709
710 static inline bool is_input(struct ir3_instruction *instr)
711 {
712 /* in some cases, ldlv is used to fetch varying without
713 * interpolation.. fortunately inloc is the first src
714 * register in either case
715 */
716 switch (instr->opc) {
717 case OPC_LDLV:
718 case OPC_BARY_F:
719 return true;
720 default:
721 return false;
722 }
723 }
724
725 static inline bool is_bool(struct ir3_instruction *instr)
726 {
727 switch (instr->opc) {
728 case OPC_CMPS_F:
729 case OPC_CMPS_S:
730 case OPC_CMPS_U:
731 return true;
732 default:
733 return false;
734 }
735 }
736
737 static inline bool is_meta(struct ir3_instruction *instr)
738 {
739 /* TODO how should we count PHI (and maybe fan-in/out) which
740 * might actually contribute some instructions to the final
741 * result?
742 */
743 return (opc_cat(instr->opc) == -1);
744 }
745
746 static inline unsigned dest_regs(struct ir3_instruction *instr)
747 {
748 if ((instr->regs_count == 0) || is_store(instr))
749 return 0;
750
751 return util_last_bit(instr->regs[0]->wrmask);
752 }
753
754 static inline bool writes_addr(struct ir3_instruction *instr)
755 {
756 if (instr->regs_count > 0) {
757 struct ir3_register *dst = instr->regs[0];
758 return reg_num(dst) == REG_A0;
759 }
760 return false;
761 }
762
763 static inline bool writes_pred(struct ir3_instruction *instr)
764 {
765 if (instr->regs_count > 0) {
766 struct ir3_register *dst = instr->regs[0];
767 return reg_num(dst) == REG_P0;
768 }
769 return false;
770 }
771
772 /* returns defining instruction for reg */
773 /* TODO better name */
774 static inline struct ir3_instruction *ssa(struct ir3_register *reg)
775 {
776 if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
777 return reg->instr;
778 }
779 return NULL;
780 }
781
/* two (non-NULL) instructions conflict if they are distinct: */
static inline bool conflicts(struct ir3_instruction *a,
		struct ir3_instruction *b)
{
	if (!a || !b)
		return false;
	return a != b;
}
787
788 static inline bool reg_gpr(struct ir3_register *r)
789 {
790 if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
791 return false;
792 if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
793 return false;
794 return true;
795 }
796
797 static inline type_t half_type(type_t type)
798 {
799 switch (type) {
800 case TYPE_F32: return TYPE_F16;
801 case TYPE_U32: return TYPE_U16;
802 case TYPE_S32: return TYPE_S16;
803 case TYPE_F16:
804 case TYPE_U16:
805 case TYPE_S16:
806 return type;
807 default:
808 assert(0);
809 return ~0;
810 }
811 }
812
/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool ir3_cat2_int(opc_t opc)
{
	switch (opc) {
	/* integer arithmetic/compare: */
	case OPC_ADD_U:
	case OPC_ADD_S:
	case OPC_SUB_U:
	case OPC_SUB_S:
	case OPC_CMPS_U:
	case OPC_CMPS_S:
	case OPC_MIN_U:
	case OPC_MIN_S:
	case OPC_MAX_U:
	case OPC_MAX_S:
	case OPC_CMPV_U:
	case OPC_CMPV_S:
	case OPC_MUL_U:
	case OPC_MUL_S:
	case OPC_MULL_U:
	case OPC_CLZ_S:
	case OPC_ABSNEG_S:
	/* bitwise: */
	case OPC_AND_B:
	case OPC_OR_B:
	case OPC_NOT_B:
	case OPC_XOR_B:
	case OPC_BFREV_B:
	case OPC_CLZ_B:
	case OPC_SHL_B:
	case OPC_SHR_B:
	case OPC_ASHR_B:
	case OPC_MGEN_B:
	case OPC_GETBIT_B:
	case OPC_CBITS_B:
	case OPC_BARY_F:
		return true;

	default:
		return false;
	}
}
855
856
/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned ir3_cat2_absneg(opc_t opc)
{
	switch (opc) {
	/* float ops take float (abs)/(neg): */
	case OPC_ADD_F:
	case OPC_MIN_F:
	case OPC_MAX_F:
	case OPC_MUL_F:
	case OPC_SIGN_F:
	case OPC_CMPS_F:
	case OPC_ABSNEG_F:
	case OPC_CMPV_F:
	case OPC_FLOOR_F:
	case OPC_CEIL_F:
	case OPC_RNDNE_F:
	case OPC_RNDAZ_F:
	case OPC_TRUNC_F:
	case OPC_BARY_F:
		return IR3_REG_FABS | IR3_REG_FNEG;

	/* integer ops (other than absneg.s) take no src modifiers: */
	case OPC_ADD_U:
	case OPC_ADD_S:
	case OPC_SUB_U:
	case OPC_SUB_S:
	case OPC_CMPS_U:
	case OPC_CMPS_S:
	case OPC_MIN_U:
	case OPC_MIN_S:
	case OPC_MAX_U:
	case OPC_MAX_S:
	case OPC_CMPV_U:
	case OPC_CMPV_S:
	case OPC_MUL_U:
	case OPC_MUL_S:
	case OPC_MULL_U:
	case OPC_CLZ_S:
		return 0;

	case OPC_ABSNEG_S:
		return IR3_REG_SABS | IR3_REG_SNEG;

	/* bitwise ops take bitwise-not: */
	case OPC_AND_B:
	case OPC_OR_B:
	case OPC_NOT_B:
	case OPC_XOR_B:
	case OPC_BFREV_B:
	case OPC_CLZ_B:
	case OPC_SHL_B:
	case OPC_SHR_B:
	case OPC_ASHR_B:
	case OPC_MGEN_B:
	case OPC_GETBIT_B:
	case OPC_CBITS_B:
		return IR3_REG_BNOT;

	default:
		return 0;
	}
}
916
/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned ir3_cat3_absneg(opc_t opc)
{
	switch (opc) {
	case OPC_MAD_F16:
	case OPC_MAD_F32:
	case OPC_SEL_F16:
	case OPC_SEL_F32:
		return IR3_REG_FNEG;

	case OPC_MAD_U16:
	case OPC_MADSH_U16:
	case OPC_MAD_S16:
	case OPC_MADSH_M16:
	case OPC_MAD_U24:
	case OPC_MAD_S24:
	case OPC_SEL_S16:
	case OPC_SEL_S32:
	case OPC_SAD_S16:
	case OPC_SAD_S32:
		/* neg *may* work on 3rd src.. */

		/* fallthrough */
	case OPC_SEL_B16:
	case OPC_SEL_B32:

		/* fallthrough */
	default:
		return 0;
	}
}
946
/* bitmask of the low n bits: */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instructions's sources (reg), also returns src #:
 * NOTE: regs[0] is the dst, so the srcs start at regs[1]
 */
#define foreach_src_n(__srcreg, __n, __instr) \
	if ((__instr)->regs_count) \
		for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
			if ((__srcreg = (__instr)->regs[__n + 1]))

/* iterator for an instructions's sources (reg): */
#define foreach_src(__srcreg, __instr) \
	foreach_src_n(__srcreg, __i, __instr)
958
959 static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
960 {
961 unsigned cnt = instr->regs_count + instr->deps_count;
962 if (instr->address)
963 cnt++;
964 return cnt;
965 }
966
967 static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
968 {
969 if (n == (instr->regs_count + instr->deps_count))
970 return instr->address;
971 if (n >= instr->regs_count)
972 return instr->deps[n - instr->regs_count];
973 return ssa(instr->regs[n]);
974 }
975
976 static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
977 {
978 if (n == (instr->regs_count + instr->deps_count))
979 return false;
980 if (n >= instr->regs_count)
981 return true;
982 return false;
983 }
984
/* NOTE(review): appears to count srcs plus optional address register;
 * confirm against users before relying on it:
 */
#define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)

/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr) \
	for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
		if ((__srcinst = __ssa_src_n(__instr, __n)))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr) \
	foreach_ssa_src_n(__srcinst, __i, __instr)
995
996
997 /* dump: */
998 void ir3_print(struct ir3 *ir);
999 void ir3_print_instr(struct ir3_instruction *instr);
1000
1001 /* depth calculation: */
1002 int ir3_delayslots(struct ir3_instruction *assigner,
1003 struct ir3_instruction *consumer, unsigned n);
1004 void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
1005 void ir3_depth(struct ir3 *ir);
1006
1007 /* copy-propagate: */
1008 struct ir3_shader_variant;
1009 void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
1010
1011 /* group neighbors and insert mov's to resolve conflicts: */
1012 void ir3_group(struct ir3 *ir);
1013
1014 /* Sethi–Ullman numbering: */
1015 void ir3_sun(struct ir3 *ir);
1016
1017 /* scheduling: */
1018 void ir3_sched_add_deps(struct ir3 *ir);
1019 int ir3_sched(struct ir3 *ir);
1020
1021 void ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so);
1022
1023 /* register assignment: */
1024 struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler);
1025 int ir3_ra(struct ir3 *ir3, gl_shader_stage type,
1026 bool frag_coord, bool frag_face);
1027
1028 /* legalize: */
1029 void ir3_legalize(struct ir3 *ir, int *num_samp, bool *has_ssbo, int *max_bary);
1030
1031 /* ************************************************************************* */
1032 /* instruction helpers */
1033
1034 static inline struct ir3_instruction *
1035 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
1036 {
1037 struct ir3_instruction *mov;
1038 unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0;
1039
1040 mov = ir3_instr_create(block, OPC_MOV);
1041 mov->cat1.src_type = type;
1042 mov->cat1.dst_type = type;
1043 ir3_reg_create(mov, 0, flags);
1044 ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
1045
1046 return mov;
1047 }
1048
/* convenience wrapper: 32-bit unsigned immediate: */
static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
	return create_immed_typed(block, val, TYPE_U32);
}
1054
1055 static inline struct ir3_instruction *
1056 create_uniform(struct ir3_block *block, unsigned n)
1057 {
1058 struct ir3_instruction *mov;
1059
1060 mov = ir3_instr_create(block, OPC_MOV);
1061 /* TODO get types right? */
1062 mov->cat1.src_type = TYPE_F32;
1063 mov->cat1.dst_type = TYPE_F32;
1064 ir3_reg_create(mov, 0, 0);
1065 ir3_reg_create(mov, n, IR3_REG_CONST);
1066
1067 return mov;
1068 }
1069
1070 static inline struct ir3_instruction *
1071 create_uniform_indirect(struct ir3_block *block, int n,
1072 struct ir3_instruction *address)
1073 {
1074 struct ir3_instruction *mov;
1075
1076 mov = ir3_instr_create(block, OPC_MOV);
1077 mov->cat1.src_type = TYPE_U32;
1078 mov->cat1.dst_type = TYPE_U32;
1079 ir3_reg_create(mov, 0, 0);
1080 ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
1081
1082 ir3_instr_set_address(mov, address);
1083
1084 return mov;
1085 }
1086
1087 /* creates SSA src of correct type (ie. half vs full precision) */
1088 static inline struct ir3_register * __ssa_src(struct ir3_instruction *instr,
1089 struct ir3_instruction *src, unsigned flags)
1090 {
1091 struct ir3_register *reg;
1092 if (src->regs[0]->flags & IR3_REG_HALF)
1093 flags |= IR3_REG_HALF;
1094 reg = ir3_reg_create(instr, 0, IR3_REG_SSA | flags);
1095 reg->instr = src;
1096 reg->wrmask = src->regs[0]->wrmask;
1097 return reg;
1098 }
1099
1100 static inline struct ir3_instruction *
1101 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
1102 {
1103 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
1104 ir3_reg_create(instr, 0, 0); /* dst */
1105 if (src->regs[0]->flags & IR3_REG_ARRAY) {
1106 struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
1107 src_reg->array = src->regs[0]->array;
1108 } else {
1109 __ssa_src(instr, src, src->regs[0]->flags & IR3_REG_HIGH);
1110 }
1111 debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
1112 instr->cat1.src_type = type;
1113 instr->cat1.dst_type = type;
1114 return instr;
1115 }
1116
1117 static inline struct ir3_instruction *
1118 ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
1119 type_t src_type, type_t dst_type)
1120 {
1121 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
1122 unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0;
1123 unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0;
1124
1125 debug_assert((src->regs[0]->flags & IR3_REG_HALF) == src_flags);
1126
1127 ir3_reg_create(instr, 0, dst_flags); /* dst */
1128 __ssa_src(instr, src, 0);
1129 instr->cat1.src_type = src_type;
1130 instr->cat1.dst_type = dst_type;
1131 debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
1132 return instr;
1133 }
1134
/* Emit a scheduling no-op into 'block'. */
static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
	return ir3_instr_create(block, OPC_NOP);
}
1140
/* Helper macros that stamp out ir3_<OPC>() builder functions, one per
 * source-count.  Each generated builder creates an instruction of the
 * given opcode in 'block', adds an (as yet unassigned) dst register,
 * and then wires up one SSA source per (instruction, flags) argument
 * pair via __ssa_src().
 */

/* zero-src builder: */
#define INSTR0(name) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create(block, OPC_##name); \
	return instr; \
}

/* one-src builder: */
#define INSTR1(name) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create(block, OPC_##name); \
	ir3_reg_create(instr, 0, 0); /* dst */ \
	__ssa_src(instr, a, aflags); \
	return instr; \
}

/* two-src builder: */
#define INSTR2(name) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags, \
		struct ir3_instruction *b, unsigned bflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create(block, OPC_##name); \
	ir3_reg_create(instr, 0, 0); /* dst */ \
	__ssa_src(instr, a, aflags); \
	__ssa_src(instr, b, bflags); \
	return instr; \
}

/* three-src builder: */
#define INSTR3(name) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags, \
		struct ir3_instruction *b, unsigned bflags, \
		struct ir3_instruction *c, unsigned cflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create(block, OPC_##name); \
	ir3_reg_create(instr, 0, 0); /* dst */ \
	__ssa_src(instr, a, aflags); \
	__ssa_src(instr, b, bflags); \
	__ssa_src(instr, c, cflags); \
	return instr; \
}

/* Like INSTR3() but also ORs IR3_INSTR_<f> into instr->flags.
 * NOTE(review): reserves 5 register slots but only creates 4
 * (dst + 3 srcs) -- confirm whether the extra slot is intentional
 * headroom or a copy/paste from INSTR4F.
 */
#define INSTR3F(f, name) \
static inline struct ir3_instruction * \
ir3_##name##_##f(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags, \
		struct ir3_instruction *b, unsigned bflags, \
		struct ir3_instruction *c, unsigned cflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create2(block, OPC_##name, 5); \
	ir3_reg_create(instr, 0, 0); /* dst */ \
	__ssa_src(instr, a, aflags); \
	__ssa_src(instr, b, bflags); \
	__ssa_src(instr, c, cflags); \
	instr->flags |= IR3_INSTR_##f; \
	return instr; \
}

/* four-src builder (5 = dst + 4 srcs): */
#define INSTR4(name) \
static inline struct ir3_instruction * \
ir3_##name(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags, \
		struct ir3_instruction *b, unsigned bflags, \
		struct ir3_instruction *c, unsigned cflags, \
		struct ir3_instruction *d, unsigned dflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create2(block, OPC_##name, 5); \
	ir3_reg_create(instr, 0, 0); /* dst */ \
	__ssa_src(instr, a, aflags); \
	__ssa_src(instr, b, bflags); \
	__ssa_src(instr, c, cflags); \
	__ssa_src(instr, d, dflags); \
	return instr; \
}

/* Like INSTR4() but also ORs IR3_INSTR_<f> into instr->flags: */
#define INSTR4F(f, name) \
static inline struct ir3_instruction * \
ir3_##name##_##f(struct ir3_block *block, \
		struct ir3_instruction *a, unsigned aflags, \
		struct ir3_instruction *b, unsigned bflags, \
		struct ir3_instruction *c, unsigned cflags, \
		struct ir3_instruction *d, unsigned dflags) \
{ \
	struct ir3_instruction *instr = \
		ir3_instr_create2(block, OPC_##name, 5); \
	ir3_reg_create(instr, 0, 0); /* dst */ \
	__ssa_src(instr, a, aflags); \
	__ssa_src(instr, b, bflags); \
	__ssa_src(instr, c, cflags); \
	__ssa_src(instr, d, dflags); \
	instr->flags |= IR3_INSTR_##f; \
	return instr; \
}
1245
/* Builder functions generated from the INSTRn() macros above, grouped
 * by instruction category:
 */

/* cat0 instructions: */
INSTR0(BR)
INSTR0(JUMP)
INSTR1(KILL)
INSTR0(END)

/* cat2 instructions, most 2 src but some 1 src: */
INSTR2(ADD_F)
INSTR2(MIN_F)
INSTR2(MAX_F)
INSTR2(MUL_F)
INSTR1(SIGN_F)
INSTR2(CMPS_F)
INSTR1(ABSNEG_F)
INSTR2(CMPV_F)
INSTR1(FLOOR_F)
INSTR1(CEIL_F)
INSTR1(RNDNE_F)
INSTR1(RNDAZ_F)
INSTR1(TRUNC_F)
INSTR2(ADD_U)
INSTR2(ADD_S)
INSTR2(SUB_U)
INSTR2(SUB_S)
INSTR2(CMPS_U)
INSTR2(CMPS_S)
INSTR2(MIN_U)
INSTR2(MIN_S)
INSTR2(MAX_U)
INSTR2(MAX_S)
INSTR1(ABSNEG_S)
INSTR2(AND_B)
INSTR2(OR_B)
INSTR1(NOT_B)
INSTR2(XOR_B)
INSTR2(CMPV_U)
INSTR2(CMPV_S)
INSTR2(MUL_U)
INSTR2(MUL_S)
INSTR2(MULL_U)
INSTR1(BFREV_B)
INSTR1(CLZ_S)
INSTR1(CLZ_B)
INSTR2(SHL_B)
INSTR2(SHR_B)
INSTR2(ASHR_B)
INSTR2(BARY_F)
INSTR2(MGEN_B)
INSTR2(GETBIT_B)
INSTR1(SETRM)
INSTR1(CBITS_B)
INSTR2(SHB)
INSTR2(MSAD)

/* cat3 instructions: */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(SEL_B16)
INSTR3(SEL_B32)
INSTR3(SEL_S16)
INSTR3(SEL_S32)
INSTR3(SEL_F16)
INSTR3(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions: */
INSTR1(RCP)
INSTR1(RSQ)
INSTR1(LOG2)
INSTR1(EXP2)
INSTR1(SIN)
INSTR1(COS)
INSTR1(SQRT)

/* cat5 instructions: */
INSTR1(DSX)
INSTR1(DSY)
1330
/* Build a cat5 sample instruction.
 *
 * 'wrmask' selects which components of the dst are written; 'flags'
 * are IR3_INSTR_* bits ORed into the instruction; 'samp'/'tex' are the
 * sampler and texture slots.  src0/src1 may each be NULL; when present
 * they are added as SSA sources whose wrmask covers regs_count - 1
 * components (ie. one bit per src of the source instruction -- src0 is
 * presumably a collect/fanin; confirm against callers).
 */
static inline struct ir3_instruction *
ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
		unsigned wrmask, unsigned flags, unsigned samp, unsigned tex,
		struct ir3_instruction *src0, struct ir3_instruction *src1)
{
	struct ir3_instruction *sam;
	struct ir3_register *reg;

	sam = ir3_instr_create(block, opc);
	sam->flags |= flags;
	ir3_reg_create(sam, 0, 0)->wrmask = wrmask;  /* dst */
	// temporary step, extra dummy src which will become the
	// hvec2(samp, tex) argument:
	ir3_reg_create(sam, 0, 0);
	if (src0) {
		reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
		reg->wrmask = (1 << (src0->regs_count - 1)) - 1;
		reg->instr = src0;
	}
	if (src1) {
		reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
		reg->instr = src1;
		reg->wrmask = (1 << (src1->regs_count - 1)) - 1;
	}
	sam->cat5.samp = samp;
	sam->cat5.tex = tex;
	sam->cat5.type = type;

	return sam;
}
1361
1362 /* cat6 instructions: */
1363 INSTR2(LDLV)
1364 INSTR2(LDG)
1365 INSTR2(LDL)
1366 INSTR3(STG)
1367 INSTR3(STL)
1368 INSTR1(RESINFO)
1369 INSTR1(RESFMT)
1370 INSTR2(ATOMIC_ADD)
1371 INSTR2(ATOMIC_SUB)
1372 INSTR2(ATOMIC_XCHG)
1373 INSTR2(ATOMIC_INC)
1374 INSTR2(ATOMIC_DEC)
1375 INSTR2(ATOMIC_CMPXCHG)
1376 INSTR2(ATOMIC_MIN)
1377 INSTR2(ATOMIC_MAX)
1378 INSTR2(ATOMIC_AND)
1379 INSTR2(ATOMIC_OR)
1380 INSTR2(ATOMIC_XOR)
1381 #if GPU >= 600
1382 INSTR3(STIB);
1383 INSTR2(LDIB);
1384 INSTR3F(G, ATOMIC_ADD)
1385 INSTR3F(G, ATOMIC_SUB)
1386 INSTR3F(G, ATOMIC_XCHG)
1387 INSTR3F(G, ATOMIC_INC)
1388 INSTR3F(G, ATOMIC_DEC)
1389 INSTR3F(G, ATOMIC_CMPXCHG)
1390 INSTR3F(G, ATOMIC_MIN)
1391 INSTR3F(G, ATOMIC_MAX)
1392 INSTR3F(G, ATOMIC_AND)
1393 INSTR3F(G, ATOMIC_OR)
1394 INSTR3F(G, ATOMIC_XOR)
1395 #elif GPU >= 400
1396 INSTR3(LDGB)
1397 INSTR4(STGB)
1398 INSTR4(STIB)
1399 INSTR4F(G, ATOMIC_ADD)
1400 INSTR4F(G, ATOMIC_SUB)
1401 INSTR4F(G, ATOMIC_XCHG)
1402 INSTR4F(G, ATOMIC_INC)
1403 INSTR4F(G, ATOMIC_DEC)
1404 INSTR4F(G, ATOMIC_CMPXCHG)
1405 INSTR4F(G, ATOMIC_MIN)
1406 INSTR4F(G, ATOMIC_MAX)
1407 INSTR4F(G, ATOMIC_AND)
1408 INSTR4F(G, ATOMIC_OR)
1409 INSTR4F(G, ATOMIC_XOR)
1410 #endif
1411
1412 /* cat7 instructions: */
1413 INSTR0(BAR)
1414 INSTR0(FENCE)
1415
1416 /* ************************************************************************* */
1417 /* split this out or find some helper to use.. like main/bitset.h.. */
1418
1419 #include <string.h>
1420
#define MAX_REG 256

/* One bit per register: the low MAX_REG bits track full registers,
 * the high MAX_REG bits track half registers (see regmask_idx()).
 */
typedef uint8_t regmask_t[2 * MAX_REG / 8];

/* Map a register to its bit position within a regmask_t.  For relative
 * (address-register indexed) accesses the array base offset stands in
 * for the register number; half registers are shifted into the upper
 * half of the mask.
 */
static inline unsigned regmask_idx(struct ir3_register *reg)
{
	unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
	debug_assert(num < MAX_REG);
	if (reg->flags & IR3_REG_HALF)
		num += MAX_REG;
	return num;
}
1433
/* Clear all bits in the register mask. */
static inline void regmask_init(regmask_t *regmask)
{
	memset(regmask, 0, sizeof(*regmask));
}
1438
1439 static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
1440 {
1441 unsigned idx = regmask_idx(reg);
1442 if (reg->flags & IR3_REG_RELATIV) {
1443 unsigned i;
1444 for (i = 0; i < reg->size; i++, idx++)
1445 (*regmask)[idx / 8] |= 1 << (idx % 8);
1446 } else {
1447 unsigned mask;
1448 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
1449 if (mask & 1)
1450 (*regmask)[idx / 8] |= 1 << (idx % 8);
1451 }
1452 }
1453
1454 static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
1455 {
1456 unsigned i;
1457 for (i = 0; i < ARRAY_SIZE(*dst); i++)
1458 (*dst)[i] = (*a)[i] | (*b)[i];
1459 }
1460
1461 /* set bits in a if not set in b, conceptually:
1462 * a |= (reg & ~b)
1463 */
1464 static inline void regmask_set_if_not(regmask_t *a,
1465 struct ir3_register *reg, regmask_t *b)
1466 {
1467 unsigned idx = regmask_idx(reg);
1468 if (reg->flags & IR3_REG_RELATIV) {
1469 unsigned i;
1470 for (i = 0; i < reg->size; i++, idx++)
1471 if (!((*b)[idx / 8] & (1 << (idx % 8))))
1472 (*a)[idx / 8] |= 1 << (idx % 8);
1473 } else {
1474 unsigned mask;
1475 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
1476 if (mask & 1)
1477 if (!((*b)[idx / 8] & (1 << (idx % 8))))
1478 (*a)[idx / 8] |= 1 << (idx % 8);
1479 }
1480 }
1481
1482 static inline bool regmask_get(regmask_t *regmask,
1483 struct ir3_register *reg)
1484 {
1485 unsigned idx = regmask_idx(reg);
1486 if (reg->flags & IR3_REG_RELATIV) {
1487 unsigned i;
1488 for (i = 0; i < reg->size; i++, idx++)
1489 if ((*regmask)[idx / 8] & (1 << (idx % 8)))
1490 return true;
1491 } else {
1492 unsigned mask;
1493 for (mask = reg->wrmask; mask; mask >>= 1, idx++)
1494 if (mask & 1)
1495 if ((*regmask)[idx / 8] & (1 << (idx % 8)))
1496 return true;
1497 }
1498 return false;
1499 }
1500
1501 /* ************************************************************************* */
1502
1503 #endif /* IR3_H_ */