freedreno/ir3: Add support for disasm of cat2 float32 immediates.
[mesa.git] / src / freedreno / ir3 / instr-a3xx.h
1 /*
2 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #ifndef INSTR_A3XX_H_
25 #define INSTR_A3XX_H_
26
27 #define PACKED __attribute__((__packed__))
28
29 #include <stdint.h>
30 #include <stdio.h>
31 #include <stdbool.h>
32 #include <assert.h>
33
/* size of largest OPC field of all the instruction categories: */
#define NOPC_BITS 6

/* Combine an instruction category and a per-category opcode into a single
 * value: the category is placed above the low NOPC_BITS bits, which hold
 * the opcode.
 *
 * The category is scaled by multiplication instead of being shifted,
 * because the meta opcodes use category -1 and left-shifting a negative
 * value is undefined behavior in C.  Both arguments are parenthesized so
 * that compound expressions can be passed safely.
 */
#define _OPC(cat, opc)   (((cat) * (1 << NOPC_BITS)) | (opc))
38
39 typedef enum {
40 /* category 0: */
41 OPC_NOP = _OPC(0, 0),
42 OPC_BR = _OPC(0, 1),
43 OPC_JUMP = _OPC(0, 2),
44 OPC_CALL = _OPC(0, 3),
45 OPC_RET = _OPC(0, 4),
46 OPC_KILL = _OPC(0, 5),
47 OPC_END = _OPC(0, 6),
48 OPC_EMIT = _OPC(0, 7),
49 OPC_CUT = _OPC(0, 8),
50 OPC_CHMASK = _OPC(0, 9),
51 OPC_CHSH = _OPC(0, 10),
52 OPC_FLOW_REV = _OPC(0, 11),
53
54 OPC_IF = _OPC(0, 13),
55 OPC_ELSE = _OPC(0, 14),
56 OPC_ENDIF = _OPC(0, 15),
57
58 /* category 1: */
59 OPC_MOV = _OPC(1, 0),
60
61 /* category 2: */
62 OPC_ADD_F = _OPC(2, 0),
63 OPC_MIN_F = _OPC(2, 1),
64 OPC_MAX_F = _OPC(2, 2),
65 OPC_MUL_F = _OPC(2, 3),
66 OPC_SIGN_F = _OPC(2, 4),
67 OPC_CMPS_F = _OPC(2, 5),
68 OPC_ABSNEG_F = _OPC(2, 6),
69 OPC_CMPV_F = _OPC(2, 7),
70 /* 8 - invalid */
71 OPC_FLOOR_F = _OPC(2, 9),
72 OPC_CEIL_F = _OPC(2, 10),
73 OPC_RNDNE_F = _OPC(2, 11),
74 OPC_RNDAZ_F = _OPC(2, 12),
75 OPC_TRUNC_F = _OPC(2, 13),
76 /* 14-15 - invalid */
77 OPC_ADD_U = _OPC(2, 16),
78 OPC_ADD_S = _OPC(2, 17),
79 OPC_SUB_U = _OPC(2, 18),
80 OPC_SUB_S = _OPC(2, 19),
81 OPC_CMPS_U = _OPC(2, 20),
82 OPC_CMPS_S = _OPC(2, 21),
83 OPC_MIN_U = _OPC(2, 22),
84 OPC_MIN_S = _OPC(2, 23),
85 OPC_MAX_U = _OPC(2, 24),
86 OPC_MAX_S = _OPC(2, 25),
87 OPC_ABSNEG_S = _OPC(2, 26),
88 /* 27 - invalid */
89 OPC_AND_B = _OPC(2, 28),
90 OPC_OR_B = _OPC(2, 29),
91 OPC_NOT_B = _OPC(2, 30),
92 OPC_XOR_B = _OPC(2, 31),
93 /* 32 - invalid */
94 OPC_CMPV_U = _OPC(2, 33),
95 OPC_CMPV_S = _OPC(2, 34),
96 /* 35-47 - invalid */
97 OPC_MUL_U24 = _OPC(2, 48), /* 24b mul into 32b result */
98 OPC_MUL_S24 = _OPC(2, 49), /* 24b mul into 32b result with sign extension */
99 OPC_MULL_U = _OPC(2, 50),
100 OPC_BFREV_B = _OPC(2, 51),
101 OPC_CLZ_S = _OPC(2, 52),
102 OPC_CLZ_B = _OPC(2, 53),
103 OPC_SHL_B = _OPC(2, 54),
104 OPC_SHR_B = _OPC(2, 55),
105 OPC_ASHR_B = _OPC(2, 56),
106 OPC_BARY_F = _OPC(2, 57),
107 OPC_MGEN_B = _OPC(2, 58),
108 OPC_GETBIT_B = _OPC(2, 59),
109 OPC_SETRM = _OPC(2, 60),
110 OPC_CBITS_B = _OPC(2, 61),
111 OPC_SHB = _OPC(2, 62),
112 OPC_MSAD = _OPC(2, 63),
113
114 /* category 3: */
115 OPC_MAD_U16 = _OPC(3, 0),
116 OPC_MADSH_U16 = _OPC(3, 1),
117 OPC_MAD_S16 = _OPC(3, 2),
118 OPC_MADSH_M16 = _OPC(3, 3), /* should this be .s16? */
119 OPC_MAD_U24 = _OPC(3, 4),
120 OPC_MAD_S24 = _OPC(3, 5),
121 OPC_MAD_F16 = _OPC(3, 6),
122 OPC_MAD_F32 = _OPC(3, 7),
123 OPC_SEL_B16 = _OPC(3, 8),
124 OPC_SEL_B32 = _OPC(3, 9),
125 OPC_SEL_S16 = _OPC(3, 10),
126 OPC_SEL_S32 = _OPC(3, 11),
127 OPC_SEL_F16 = _OPC(3, 12),
128 OPC_SEL_F32 = _OPC(3, 13),
129 OPC_SAD_S16 = _OPC(3, 14),
130 OPC_SAD_S32 = _OPC(3, 15),
131
132 /* category 4: */
133 OPC_RCP = _OPC(4, 0),
134 OPC_RSQ = _OPC(4, 1),
135 OPC_LOG2 = _OPC(4, 2),
136 OPC_EXP2 = _OPC(4, 3),
137 OPC_SIN = _OPC(4, 4),
138 OPC_COS = _OPC(4, 5),
139 OPC_SQRT = _OPC(4, 6),
140 /* NOTE that these are 8+opc from their highp equivs, so it's possible
141 * that the high order bit in the opc field has been repurposed for
142 * half-precision use? But note that other ops (rcp/lsin/cos/sqrt)
143 * still use the same opc as highp
144 */
145 OPC_HRSQ = _OPC(4, 9),
146 OPC_HLOG2 = _OPC(4, 10),
147 OPC_HEXP2 = _OPC(4, 11),
148
149 /* category 5: */
150 OPC_ISAM = _OPC(5, 0),
151 OPC_ISAML = _OPC(5, 1),
152 OPC_ISAMM = _OPC(5, 2),
153 OPC_SAM = _OPC(5, 3),
154 OPC_SAMB = _OPC(5, 4),
155 OPC_SAML = _OPC(5, 5),
156 OPC_SAMGQ = _OPC(5, 6),
157 OPC_GETLOD = _OPC(5, 7),
158 OPC_CONV = _OPC(5, 8),
159 OPC_CONVM = _OPC(5, 9),
160 OPC_GETSIZE = _OPC(5, 10),
161 OPC_GETBUF = _OPC(5, 11),
162 OPC_GETPOS = _OPC(5, 12),
163 OPC_GETINFO = _OPC(5, 13),
164 OPC_DSX = _OPC(5, 14),
165 OPC_DSY = _OPC(5, 15),
166 OPC_GATHER4R = _OPC(5, 16),
167 OPC_GATHER4G = _OPC(5, 17),
168 OPC_GATHER4B = _OPC(5, 18),
169 OPC_GATHER4A = _OPC(5, 19),
170 OPC_SAMGP0 = _OPC(5, 20),
171 OPC_SAMGP1 = _OPC(5, 21),
172 OPC_SAMGP2 = _OPC(5, 22),
173 OPC_SAMGP3 = _OPC(5, 23),
174 OPC_DSXPP_1 = _OPC(5, 24),
175 OPC_DSYPP_1 = _OPC(5, 25),
176 OPC_RGETPOS = _OPC(5, 26),
177 OPC_RGETINFO = _OPC(5, 27),
178
179 /* category 6: */
180 OPC_LDG = _OPC(6, 0), /* load-global */
181 OPC_LDL = _OPC(6, 1),
182 OPC_LDP = _OPC(6, 2),
183 OPC_STG = _OPC(6, 3), /* store-global */
184 OPC_STL = _OPC(6, 4),
185 OPC_STP = _OPC(6, 5),
186 OPC_LDIB = _OPC(6, 6),
187 OPC_G2L = _OPC(6, 7),
188 OPC_L2G = _OPC(6, 8),
189 OPC_PREFETCH = _OPC(6, 9),
190 OPC_LDLW = _OPC(6, 10),
191 OPC_STLW = _OPC(6, 11),
192 OPC_RESFMT = _OPC(6, 14),
193 OPC_RESINFO = _OPC(6, 15),
194 OPC_ATOMIC_ADD = _OPC(6, 16),
195 OPC_ATOMIC_SUB = _OPC(6, 17),
196 OPC_ATOMIC_XCHG = _OPC(6, 18),
197 OPC_ATOMIC_INC = _OPC(6, 19),
198 OPC_ATOMIC_DEC = _OPC(6, 20),
199 OPC_ATOMIC_CMPXCHG = _OPC(6, 21),
200 OPC_ATOMIC_MIN = _OPC(6, 22),
201 OPC_ATOMIC_MAX = _OPC(6, 23),
202 OPC_ATOMIC_AND = _OPC(6, 24),
203 OPC_ATOMIC_OR = _OPC(6, 25),
204 OPC_ATOMIC_XOR = _OPC(6, 26),
205 OPC_LDGB = _OPC(6, 27),
206 OPC_STGB = _OPC(6, 28),
207 OPC_STIB = _OPC(6, 29),
208 OPC_LDC = _OPC(6, 30),
209 OPC_LDLV = _OPC(6, 31),
210
211 /* category 7: */
212 OPC_BAR = _OPC(7, 0),
213 OPC_FENCE = _OPC(7, 1),
214
215 /* meta instructions (category -1): */
216 /* placeholder instr to mark shader inputs: */
217 OPC_META_INPUT = _OPC(-1, 0),
218 /* The "collect" and "split" instructions are used for keeping
219 * track of instructions that write to multiple dst registers
220 * (split) like texture sample instructions, or read multiple
221 * consecutive scalar registers (collect) (bary.f, texture samp)
222 *
223 * A "split" extracts a scalar component from a vecN, and a
224 * "collect" gathers multiple scalar components into a vecN
225 */
226 OPC_META_SPLIT = _OPC(-1, 2),
227 OPC_META_COLLECT = _OPC(-1, 3),
228
229 /* placeholder for texture fetches that run before FS invocation
230 * starts:
231 */
232 OPC_META_TEX_PREFETCH = _OPC(-1, 4),
233
234 } opc_t;
235
/* Split a value built with _OPC() back into its two parts: opc_cat()
 * yields the bits above NOPC_BITS as an int, opc_op() yields the low
 * NOPC_BITS bits as an unsigned.
 */
#define opc_cat(opc)   ((int)((opc) >> NOPC_BITS))
#define opc_op(opc)    ((unsigned)((opc) & ((1 << NOPC_BITS) - 1)))
238
/* Data type codes; see type_size() for the bit width of each. */
typedef enum {
	TYPE_F16 = 0,
	TYPE_F32 = 1,
	TYPE_U16 = 2,
	TYPE_U32 = 3,
	TYPE_S16 = 4,
	TYPE_S32 = 5,
	TYPE_U8  = 6,
	TYPE_S8  = 7,  /* XXX I assume? */
} type_t;
249
250 static inline uint32_t type_size(type_t type)
251 {
252 switch (type) {
253 case TYPE_F32:
254 case TYPE_U32:
255 case TYPE_S32:
256 return 32;
257 case TYPE_F16:
258 case TYPE_U16:
259 case TYPE_S16:
260 return 16;
261 case TYPE_U8:
262 case TYPE_S8:
263 return 8;
264 default:
265 assert(0); /* invalid type */
266 return 0;
267 }
268 }
269
270 static inline int type_float(type_t type)
271 {
272 return (type == TYPE_F32) || (type == TYPE_F16);
273 }
274
275 static inline int type_uint(type_t type)
276 {
277 return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8);
278 }
279
280 static inline int type_sint(type_t type)
281 {
282 return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8);
283 }
284
285 typedef union PACKED {
286 /* normal gpr or const src register: */
287 struct PACKED {
288 uint32_t comp : 2;
289 uint32_t num : 10;
290 };
291 /* for immediate val: */
292 int32_t iim_val : 11;
293 /* to make compiler happy: */
294 uint32_t dummy32;
295 uint32_t dummy10 : 10;
296 int32_t idummy10 : 10;
297 uint32_t dummy11 : 11;
298 uint32_t dummy12 : 12;
299 uint32_t dummy13 : 13;
300 uint32_t dummy8 : 8;
301 int32_t idummy13 : 13;
302 int32_t idummy8 : 8;
303 } reg_t;
304
305 /* special registers: */
306 #define REG_A0 61 /* address register */
307 #define REG_P0 62 /* predicate register */
308
309 static inline int reg_special(reg_t reg)
310 {
311 return (reg.num == REG_A0) || (reg.num == REG_P0);
312 }
313
314 typedef struct PACKED {
315 /* dword0: */
316 union PACKED {
317 struct PACKED {
318 int16_t immed : 16;
319 uint32_t dummy1 : 16;
320 } a3xx;
321 struct PACKED {
322 int32_t immed : 20;
323 uint32_t dummy1 : 12;
324 } a4xx;
325 struct PACKED {
326 int32_t immed : 32;
327 } a5xx;
328 };
329
330 /* dword1: */
331 uint32_t dummy2 : 8;
332 uint32_t repeat : 3;
333 uint32_t dummy3 : 1;
334 uint32_t ss : 1;
335 uint32_t dummy4 : 7;
336 uint32_t inv : 1;
337 uint32_t comp : 2;
338 uint32_t opc : 4;
339 uint32_t jmp_tgt : 1;
340 uint32_t sync : 1;
341 uint32_t opc_cat : 3;
342 } instr_cat0_t;
343
344 typedef struct PACKED {
345 /* dword0: */
346 union PACKED {
347 /* for normal src register: */
348 struct PACKED {
349 uint32_t src : 11;
350 /* at least low bit of pad must be zero or it will
351 * look like a address relative src
352 */
353 uint32_t pad : 21;
354 };
355 /* for address relative: */
356 struct PACKED {
357 int32_t off : 10;
358 uint32_t src_rel_c : 1;
359 uint32_t src_rel : 1;
360 uint32_t unknown : 20;
361 };
362 /* for immediate: */
363 int32_t iim_val;
364 uint32_t uim_val;
365 float fim_val;
366 };
367
368 /* dword1: */
369 uint32_t dst : 8;
370 uint32_t repeat : 3;
371 uint32_t src_r : 1;
372 uint32_t ss : 1;
373 uint32_t ul : 1;
374 uint32_t dst_type : 3;
375 uint32_t dst_rel : 1;
376 uint32_t src_type : 3;
377 uint32_t src_c : 1;
378 uint32_t src_im : 1;
379 uint32_t even : 1;
380 uint32_t pos_inf : 1;
381 uint32_t must_be_0 : 2;
382 uint32_t jmp_tgt : 1;
383 uint32_t sync : 1;
384 uint32_t opc_cat : 3;
385 } instr_cat1_t;
386
387 typedef struct PACKED {
388 /* dword0: */
389 union PACKED {
390 struct PACKED {
391 uint32_t src1 : 11;
392 uint32_t must_be_zero1: 2;
393 uint32_t src1_im : 1; /* immediate */
394 uint32_t src1_neg : 1; /* negate */
395 uint32_t src1_abs : 1; /* absolute value */
396 };
397 struct PACKED {
398 uint32_t src1 : 10;
399 uint32_t src1_c : 1; /* relative-const */
400 uint32_t src1_rel : 1; /* relative address */
401 uint32_t must_be_zero : 1;
402 uint32_t dummy : 3;
403 } rel1;
404 struct PACKED {
405 uint32_t src1 : 12;
406 uint32_t src1_c : 1; /* const */
407 uint32_t dummy : 3;
408 } c1;
409 };
410
411 union PACKED {
412 struct PACKED {
413 uint32_t src2 : 11;
414 uint32_t must_be_zero2: 2;
415 uint32_t src2_im : 1; /* immediate */
416 uint32_t src2_neg : 1; /* negate */
417 uint32_t src2_abs : 1; /* absolute value */
418 };
419 struct PACKED {
420 uint32_t src2 : 10;
421 uint32_t src2_c : 1; /* relative-const */
422 uint32_t src2_rel : 1; /* relative address */
423 uint32_t must_be_zero : 1;
424 uint32_t dummy : 3;
425 } rel2;
426 struct PACKED {
427 uint32_t src2 : 12;
428 uint32_t src2_c : 1; /* const */
429 uint32_t dummy : 3;
430 } c2;
431 };
432
433 /* dword1: */
434 uint32_t dst : 8;
435 uint32_t repeat : 2;
436 uint32_t sat : 1;
437 uint32_t src1_r : 1; /* doubles as nop0 if repeat==0 */
438 uint32_t ss : 1;
439 uint32_t ul : 1; /* dunno */
440 uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
441 uint32_t ei : 1;
442 uint32_t cond : 3;
443 uint32_t src2_r : 1; /* doubles as nop1 if repeat==0 */
444 uint32_t full : 1; /* not half */
445 uint32_t opc : 6;
446 uint32_t jmp_tgt : 1;
447 uint32_t sync : 1;
448 uint32_t opc_cat : 3;
449 } instr_cat2_t;
450
451 typedef struct PACKED {
452 /* dword0: */
453 union PACKED {
454 struct PACKED {
455 uint32_t src1 : 11;
456 uint32_t must_be_zero1: 2;
457 uint32_t src2_c : 1;
458 uint32_t src1_neg : 1;
459 uint32_t src2_r : 1; /* doubles as nop1 if repeat==0 */
460 };
461 struct PACKED {
462 uint32_t src1 : 10;
463 uint32_t src1_c : 1;
464 uint32_t src1_rel : 1;
465 uint32_t must_be_zero : 1;
466 uint32_t dummy : 3;
467 } rel1;
468 struct PACKED {
469 uint32_t src1 : 12;
470 uint32_t src1_c : 1;
471 uint32_t dummy : 3;
472 } c1;
473 };
474
475 union PACKED {
476 struct PACKED {
477 uint32_t src3 : 11;
478 uint32_t must_be_zero2: 2;
479 uint32_t src3_r : 1;
480 uint32_t src2_neg : 1;
481 uint32_t src3_neg : 1;
482 };
483 struct PACKED {
484 uint32_t src3 : 10;
485 uint32_t src3_c : 1;
486 uint32_t src3_rel : 1;
487 uint32_t must_be_zero : 1;
488 uint32_t dummy : 3;
489 } rel2;
490 struct PACKED {
491 uint32_t src3 : 12;
492 uint32_t src3_c : 1;
493 uint32_t dummy : 3;
494 } c2;
495 };
496
497 /* dword1: */
498 uint32_t dst : 8;
499 uint32_t repeat : 2;
500 uint32_t sat : 1;
501 uint32_t src1_r : 1; /* doubles as nop0 if repeat==0 */
502 uint32_t ss : 1;
503 uint32_t ul : 1;
504 uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
505 uint32_t src2 : 8;
506 uint32_t opc : 4;
507 uint32_t jmp_tgt : 1;
508 uint32_t sync : 1;
509 uint32_t opc_cat : 3;
510 } instr_cat3_t;
511
512 static inline bool instr_cat3_full(instr_cat3_t *cat3)
513 {
514 switch (_OPC(3, cat3->opc)) {
515 case OPC_MAD_F16:
516 case OPC_MAD_U16:
517 case OPC_MAD_S16:
518 case OPC_SEL_B16:
519 case OPC_SEL_S16:
520 case OPC_SEL_F16:
521 case OPC_SAD_S16:
522 case OPC_SAD_S32: // really??
523 return false;
524 default:
525 return true;
526 }
527 }
528
529 typedef struct PACKED {
530 /* dword0: */
531 union PACKED {
532 struct PACKED {
533 uint32_t src : 11;
534 uint32_t must_be_zero1: 2;
535 uint32_t src_im : 1; /* immediate */
536 uint32_t src_neg : 1; /* negate */
537 uint32_t src_abs : 1; /* absolute value */
538 };
539 struct PACKED {
540 uint32_t src : 10;
541 uint32_t src_c : 1; /* relative-const */
542 uint32_t src_rel : 1; /* relative address */
543 uint32_t must_be_zero : 1;
544 uint32_t dummy : 3;
545 } rel;
546 struct PACKED {
547 uint32_t src : 12;
548 uint32_t src_c : 1; /* const */
549 uint32_t dummy : 3;
550 } c;
551 };
552 uint32_t dummy1 : 16; /* seem to be ignored */
553
554 /* dword1: */
555 uint32_t dst : 8;
556 uint32_t repeat : 2;
557 uint32_t sat : 1;
558 uint32_t src_r : 1;
559 uint32_t ss : 1;
560 uint32_t ul : 1;
561 uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
562 uint32_t dummy2 : 5; /* seem to be ignored */
563 uint32_t full : 1; /* not half */
564 uint32_t opc : 6;
565 uint32_t jmp_tgt : 1;
566 uint32_t sync : 1;
567 uint32_t opc_cat : 3;
568 } instr_cat4_t;
569
/* With is_s2en_bindless = 1, this determines whether bindless is enabled
 * and, if so, how to get the (base, index) pair for both sampler and
 * texture.  There is a single base embedded in the instruction, which is
 * always used for the texture.
 */
typedef enum {
	/* Use traditional GL binding model, get texture and sampler index
	 * from src3 which is not presumed to be uniform.  This is
	 * backwards-compatible with earlier generations, where this field
	 * was always 0 and nonuniform-indexed sampling always worked.
	 */
	CAT5_NONUNIFORM = 0,

	/* The sampler base comes from the low 3 bits of a1.x, and the
	 * sampler and texture index come from src3 which is presumed to be
	 * uniform.
	 */
	CAT5_BINDLESS_A1_UNIFORM = 1,

	/* The texture and sampler share the same base, and the sampler and
	 * texture index come from src3 which is *not* presumed to be
	 * uniform.
	 */
	CAT5_BINDLESS_NONUNIFORM = 2,

	/* The sampler base comes from the low 3 bits of a1.x, and the
	 * sampler and texture index come from src3 which is *not* presumed
	 * to be uniform.
	 */
	CAT5_BINDLESS_A1_NONUNIFORM = 3,

	/* Use traditional GL binding model, get texture and sampler index
	 * from src3 which is presumed to be uniform.
	 */
	CAT5_UNIFORM = 4,

	/* The texture and sampler share the same base, and the sampler and
	 * texture index come from src3 which is presumed to be uniform.
	 */
	CAT5_BINDLESS_UNIFORM = 5,

	/* The texture and sampler share the same base, get sampler index
	 * from low 4 bits of src3 and texture index from high 4 bits.
	 */
	CAT5_BINDLESS_IMM = 6,

	/* The sampler base comes from the low 3 bits of a1.x, and the
	 * texture index comes from the next 8 bits of a1.x.  The sampler
	 * index is an immediate in src3.
	 */
	CAT5_BINDLESS_A1_IMM = 7,
} cat5_desc_mode_t;
620
621 typedef struct PACKED {
622 /* dword0: */
623 union PACKED {
624 /* normal case: */
625 struct PACKED {
626 uint32_t full : 1; /* not half */
627 uint32_t src1 : 8;
628 uint32_t src2 : 8;
629 uint32_t dummy1 : 4; /* seem to be ignored */
630 uint32_t samp : 4;
631 uint32_t tex : 7;
632 } norm;
633 /* s2en case: */
634 struct PACKED {
635 uint32_t full : 1; /* not half */
636 uint32_t src1 : 8;
637 uint32_t src2 : 8;
638 uint32_t dummy1 : 2;
639 uint32_t base_hi : 2;
640 uint32_t src3 : 8;
641 uint32_t desc_mode : 3;
642 } s2en_bindless;
643 /* same in either case: */
644 // XXX I think, confirm this
645 struct PACKED {
646 uint32_t full : 1; /* not half */
647 uint32_t src1 : 8;
648 uint32_t src2 : 8;
649 uint32_t pad : 15;
650 };
651 };
652
653 /* dword1: */
654 uint32_t dst : 8;
655 uint32_t wrmask : 4; /* write-mask */
656 uint32_t type : 3;
657 uint32_t base_lo : 1; /* used with bindless */
658 uint32_t is_3d : 1;
659
660 uint32_t is_a : 1;
661 uint32_t is_s : 1;
662 uint32_t is_s2en_bindless : 1;
663 uint32_t is_o : 1;
664 uint32_t is_p : 1;
665
666 uint32_t opc : 5;
667 uint32_t jmp_tgt : 1;
668 uint32_t sync : 1;
669 uint32_t opc_cat : 3;
670 } instr_cat5_t;
671
/* cat6 dword0 layout when src_off is set: [src1 + off], src2
 * (dword1 is left opaque in this view)
 */
typedef struct PACKED {
	/* dword0: */
	uint32_t mustbe1 : 1;
	int32_t  off     : 13;
	uint32_t src1    : 8;
	uint32_t src1_im : 1;
	uint32_t src2_im : 1;
	uint32_t src2    : 8;

	/* dword1: */
	uint32_t dword1;
} instr_cat6a_t;
685
/* cat6 dword0 layout when src_off is clear: [src1], src2
 * (dword1 is left opaque in this view)
 */
typedef struct PACKED {
	/* dword0: */
	uint32_t mustbe0 : 1;
	uint32_t src1    : 13;
	uint32_t ignore0 : 8;
	uint32_t src1_im : 1;
	uint32_t src2_im : 1;
	uint32_t src2    : 8;

	/* dword1: */
	uint32_t dword1;
} instr_cat6b_t;
699
/* cat6 dword1 layout when dst_off is set (dword0 is left opaque): */
typedef struct PACKED {
	/* dword0: */
	uint32_t dword0;

	/* dword1: */

	/* note: there is some weird stuff going on where sometimes
	 * cat6->a.off is involved.. but that seems like a bug in
	 * the blob, since it is used even if !cat6->src_off.
	 * It would make sense for there to be some more bits to
	 * bring us to 11 bits worth of offset, but not sure..
	 */
	int32_t  off     : 8;
	uint32_t mustbe1 : 1;
	uint32_t dst     : 8;
	uint32_t pad1    : 15;
} instr_cat6c_t;
716
/* cat6 dword1 layout when dst_off is clear (dword0 is left opaque): */
typedef struct PACKED {
	/* dword0: */
	uint32_t dword0;

	/* dword1: */
	uint32_t dst     : 8;
	uint32_t mustbe0 : 1;
	uint32_t idx     : 8;
	uint32_t pad0    : 15;
} instr_cat6d_t;
727
/* cat6 layout used for ldgb and atomics:
 *
 *   ldgb:       pad0=0, pad3=1
 *   atomic .g:  pad0=1, pad3=1
 *          .l:  pad0=1, pad3=0
 */
typedef struct PACKED {
	/* dword0: */
	uint32_t pad0      : 1;
	uint32_t src3      : 8;
	uint32_t d         : 2;
	uint32_t typed     : 1;
	uint32_t type_size : 2;
	uint32_t src1      : 8;
	uint32_t src1_im   : 1;
	uint32_t src2_im   : 1;
	uint32_t src2      : 8;

	/* dword1: */
	uint32_t dst       : 8;
	uint32_t mustbe0   : 1;
	uint32_t src_ssbo  : 8;
	uint32_t pad2      : 3;   /* type */
	uint32_t g         : 1;
	uint32_t pad3      : 1;
	uint32_t pad4      : 10;  /* opc/jmp_tgt/sync/opc_cat */
} instr_cat6ldgb_t;
755
/* cat6 layout used for stgb: pad0=0, pad3=2 */
typedef struct PACKED {
	/* dword0: */
	uint32_t mustbe1   : 1;   /* ??? */
	uint32_t src1      : 8;
	uint32_t d         : 2;
	uint32_t typed     : 1;
	uint32_t type_size : 2;
	uint32_t pad0      : 9;
	uint32_t src2_im   : 1;
	uint32_t src2      : 8;

	/* dword1: */
	uint32_t src3      : 8;
	uint32_t src3_im   : 1;
	uint32_t dst_ssbo  : 8;
	uint32_t pad2      : 3;   /* type */
	uint32_t pad3      : 2;
	uint32_t pad4      : 10;  /* opc/jmp_tgt/sync/opc_cat */
} instr_cat6stgb_t;
777
778 typedef union PACKED {
779 instr_cat6a_t a;
780 instr_cat6b_t b;
781 instr_cat6c_t c;
782 instr_cat6d_t d;
783 instr_cat6ldgb_t ldgb;
784 instr_cat6stgb_t stgb;
785 struct PACKED {
786 /* dword0: */
787 uint32_t src_off : 1;
788 uint32_t pad1 : 31;
789
790 /* dword1: */
791 uint32_t pad2 : 8;
792 uint32_t dst_off : 1;
793 uint32_t pad3 : 8;
794 uint32_t type : 3;
795 uint32_t g : 1; /* or in some cases it means dst immed */
796 uint32_t pad4 : 1;
797 uint32_t opc : 5;
798 uint32_t jmp_tgt : 1;
799 uint32_t sync : 1;
800 uint32_t opc_cat : 3;
801 };
802 } instr_cat6_t;
803
/* Similar to cat5_desc_mode_t, describes how the descriptor is loaded. */
typedef enum {
	/* Use old GL binding model with an immediate index.
	 * TODO: find CAT6_UNIFORM and CAT6_NONUNIFORM
	 */
	CAT6_IMM = 0,

	/* Use the bindless model, with an immediate index. */
	CAT6_BINDLESS_IMM = 4,

	/* Use the bindless model, with a uniform register index. */
	CAT6_BINDLESS_UNIFORM = 5,

	/* Use the bindless model, with a register index that isn't
	 * guaranteed to be uniform.  This presumably checks if the indices
	 * are equal and splits up the load/store, because it works the way
	 * you would expect.
	 */
	CAT6_BINDLESS_NONUNIFORM = 6,
} cat6_desc_mode_t;
827
/**
 * a6xx cat6 instruction encoding.
 *
 * For atomic ops (which return a value):
 *
 *    pad1=1, pad3=c, pad5=3
 *    src1    - vecN offset/coords
 *    src2.x  - is actually dest register
 *    src2.y  - is 'data' except for cmpxchg where src2.y is 'compare'
 *              and src2.z is 'data'
 *
 * For stib (which does not return a value):
 *    pad1=0, pad3=c, pad5=2
 *    src1    - vecN offset/coords
 *    src2    - value to store
 *
 * For ldib:
 *    pad1=1, pad3=c, pad5=2
 *    src1    - vecN offset/coords
 *
 * For ldc (load from UBO using descriptor):
 *    pad1=0, pad3=8, pad5=2
 *
 * pad2 and pad5 are only observed to be 0.
 *
 * NOTE(review): the last statement conflicts with the pad5=2/3 values
 * listed above; possibly a different pad field (pad4?) was meant --
 * TODO confirm.
 */
typedef struct PACKED {
	/* dword0: */
	uint32_t pad1      : 1;
	uint32_t base      : 3;
	uint32_t pad2      : 2;
	uint32_t desc_mode : 3;
	uint32_t d         : 2;
	uint32_t typed     : 1;
	uint32_t type_size : 2;
	uint32_t opc       : 5;
	uint32_t pad3      : 5;
	uint32_t src1      : 8;   /* coordinate/offset */

	/* dword1: */
	uint32_t src2      : 8;   /* or the dst for load instructions */
	uint32_t pad4      : 1;   /* mustbe0 ?? */
	uint32_t ssbo      : 8;   /* ssbo/image binding point */
	uint32_t type      : 3;
	uint32_t pad5      : 7;
	uint32_t jmp_tgt   : 1;
	uint32_t sync      : 1;
	uint32_t opc_cat   : 3;
} instr_cat6_a6xx_t;
874
/* Category 7 instruction encoding; all known fields are in dword1. */
typedef struct PACKED {
	/* dword0: */
	uint32_t pad1     : 32;

	/* dword1: */
	uint32_t pad2     : 12;
	uint32_t ss       : 1;   /* maybe in the encoding, but blob only uses (sy) */
	uint32_t pad3     : 6;
	uint32_t w        : 1;   /* write */
	uint32_t r        : 1;   /* read */
	uint32_t l        : 1;   /* local */
	uint32_t g        : 1;   /* global */
	uint32_t opc      : 4;   /* presumed, but only a couple known OPCs */
	uint32_t jmp_tgt  : 1;   /* (jp) */
	uint32_t sync     : 1;   /* (sy) */
	uint32_t opc_cat  : 3;
} instr_cat7_t;
892
893 typedef union PACKED {
894 instr_cat0_t cat0;
895 instr_cat1_t cat1;
896 instr_cat2_t cat2;
897 instr_cat3_t cat3;
898 instr_cat4_t cat4;
899 instr_cat5_t cat5;
900 instr_cat6_t cat6;
901 instr_cat6_a6xx_t cat6_a6xx;
902 instr_cat7_t cat7;
903 struct PACKED {
904 /* dword0: */
905 uint32_t pad1 : 32;
906
907 /* dword1: */
908 uint32_t pad2 : 12;
909 uint32_t ss : 1; /* cat1-cat4 (cat0??) and cat7 (?) */
910 uint32_t ul : 1; /* cat2-cat4 (and cat1 in blob.. which may be bug??) */
911 uint32_t pad3 : 13;
912 uint32_t jmp_tgt : 1;
913 uint32_t sync : 1;
914 uint32_t opc_cat : 3;
915
916 };
917 } instr_t;
918
919 static inline uint32_t instr_repeat(instr_t *instr)
920 {
921 switch (instr->opc_cat) {
922 case 0: return instr->cat0.repeat;
923 case 1: return instr->cat1.repeat;
924 case 2: return instr->cat2.repeat;
925 case 3: return instr->cat3.repeat;
926 case 4: return instr->cat4.repeat;
927 default: return 0;
928 }
929 }
930
931 static inline bool instr_sat(instr_t *instr)
932 {
933 switch (instr->opc_cat) {
934 case 2: return instr->cat2.sat;
935 case 3: return instr->cat3.sat;
936 case 4: return instr->cat4.sat;
937 default: return false;
938 }
939 }
940
941 /* We can probably drop the gpu_id arg, but keeping it for now so we can
942 * assert if we see something we think should be new encoding on an older
943 * gpu.
944 */
945 static inline bool is_cat6_legacy(instr_t *instr, unsigned gpu_id)
946 {
947 instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx;
948
949 /* At least one of these two bits is pad in all the possible
950 * "legacy" cat6 encodings, and a analysis of all the pre-a6xx
951 * cmdstream traces I have indicates that the pad bit is zero
952 * in all cases. So we can use this to detect new encoding:
953 */
954 if ((cat6->pad3 & 0x8) && (cat6->pad5 & 0x2)) {
955 assert(gpu_id >= 600);
956 assert(instr->cat6.opc == 0);
957 return false;
958 }
959
960 return true;
961 }
962
963 static inline uint32_t instr_opc(instr_t *instr, unsigned gpu_id)
964 {
965 switch (instr->opc_cat) {
966 case 0: return instr->cat0.opc;
967 case 1: return 0;
968 case 2: return instr->cat2.opc;
969 case 3: return instr->cat3.opc;
970 case 4: return instr->cat4.opc;
971 case 5: return instr->cat5.opc;
972 case 6:
973 if (!is_cat6_legacy(instr, gpu_id))
974 return instr->cat6_a6xx.opc;
975 return instr->cat6.opc;
976 case 7: return instr->cat7.opc;
977 default: return 0;
978 }
979 }
980
981 static inline bool is_mad(opc_t opc)
982 {
983 switch (opc) {
984 case OPC_MAD_U16:
985 case OPC_MAD_S16:
986 case OPC_MAD_U24:
987 case OPC_MAD_S24:
988 case OPC_MAD_F16:
989 case OPC_MAD_F32:
990 return true;
991 default:
992 return false;
993 }
994 }
995
996 static inline bool is_madsh(opc_t opc)
997 {
998 switch (opc) {
999 case OPC_MADSH_U16:
1000 case OPC_MADSH_M16:
1001 return true;
1002 default:
1003 return false;
1004 }
1005 }
1006
1007 static inline bool is_atomic(opc_t opc)
1008 {
1009 switch (opc) {
1010 case OPC_ATOMIC_ADD:
1011 case OPC_ATOMIC_SUB:
1012 case OPC_ATOMIC_XCHG:
1013 case OPC_ATOMIC_INC:
1014 case OPC_ATOMIC_DEC:
1015 case OPC_ATOMIC_CMPXCHG:
1016 case OPC_ATOMIC_MIN:
1017 case OPC_ATOMIC_MAX:
1018 case OPC_ATOMIC_AND:
1019 case OPC_ATOMIC_OR:
1020 case OPC_ATOMIC_XOR:
1021 return true;
1022 default:
1023 return false;
1024 }
1025 }
1026
1027 static inline bool is_ssbo(opc_t opc)
1028 {
1029 switch (opc) {
1030 case OPC_RESFMT:
1031 case OPC_RESINFO:
1032 case OPC_LDGB:
1033 case OPC_STGB:
1034 case OPC_STIB:
1035 return true;
1036 default:
1037 return false;
1038 }
1039 }
1040
1041 static inline bool is_isam(opc_t opc)
1042 {
1043 switch (opc) {
1044 case OPC_ISAM:
1045 case OPC_ISAML:
1046 case OPC_ISAMM:
1047 return true;
1048 default:
1049 return false;
1050 }
1051 }
1052
1053
1054 static inline bool is_cat2_float(opc_t opc)
1055 {
1056 switch (opc) {
1057 case OPC_ADD_F:
1058 case OPC_MIN_F:
1059 case OPC_MAX_F:
1060 case OPC_MUL_F:
1061 case OPC_SIGN_F:
1062 case OPC_CMPS_F:
1063 case OPC_ABSNEG_F:
1064 case OPC_CMPV_F:
1065 case OPC_FLOOR_F:
1066 case OPC_CEIL_F:
1067 case OPC_RNDNE_F:
1068 case OPC_RNDAZ_F:
1069 case OPC_TRUNC_F:
1070 return true;
1071
1072 default:
1073 return false;
1074 }
1075 }
1076
1077 static inline bool is_cat3_float(opc_t opc)
1078 {
1079 switch (opc) {
1080 case OPC_MAD_F16:
1081 case OPC_MAD_F32:
1082 case OPC_SEL_F16:
1083 case OPC_SEL_F32:
1084 return true;
1085 default:
1086 return false;
1087 }
1088 }
1089
1090 int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id);
1091
1092 #endif /* INSTR_A3XX_H_ */