pan/bit: Use swizzle helper for round
[mesa.git] / src / panfrost / bifrost / test / bi_test_pack.c
1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors (Collabora):
24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25 */
26
27 #include "bit.h"
28 #include "bi_print.h"
29 #include "util/half_float.h"
30 #include "bifrost/disassemble.h"
31
32 /* Instruction packing tests */
33
34 static void
35 bit_test_single(struct panfrost_device *dev,
36 bi_instruction *ins,
37 uint32_t input[4],
38 bool fma, enum bit_debug debug)
39 {
40 /* First, simulate the instruction */
41 struct bit_state s = { 0 };
42 memcpy(s.r, input, 16);
43 bit_step(&s, ins, fma);
44
45 /* Next, wrap it up and pack it */
46
47 bi_instruction ldubo = {
48 .type = BI_LOAD_UNIFORM,
49 .src = {
50 BIR_INDEX_CONSTANT,
51 BIR_INDEX_ZERO
52 },
53 .src_types = {
54 nir_type_uint32,
55 nir_type_uint32,
56 },
57 .dest = BIR_INDEX_REGISTER | 0,
58 .dest_type = nir_type_uint32,
59 .vector_channels = 4,
60 };
61
62 bi_instruction ldva = {
63 .type = BI_LOAD_VAR_ADDRESS,
64 .vector_channels = 3,
65 .dest = BIR_INDEX_REGISTER | 32,
66 .dest_type = nir_type_uint32,
67 .src = {
68 BIR_INDEX_CONSTANT,
69 BIR_INDEX_REGISTER | 61,
70 BIR_INDEX_REGISTER | 62,
71 0,
72 },
73 .src_types = {
74 nir_type_uint32,
75 nir_type_uint32,
76 nir_type_uint32,
77 nir_type_uint32,
78 }
79 };
80
81 bi_instruction st = {
82 .type = BI_STORE_VAR,
83 .src = {
84 BIR_INDEX_REGISTER | 0,
85 ldva.dest, ldva.dest + 1, ldva.dest + 2,
86 },
87 .src_types = {
88 nir_type_uint32,
89 nir_type_uint32, nir_type_uint32, nir_type_uint32,
90 },
91 .vector_channels = 4
92 };
93
94 bi_context *ctx = rzalloc(NULL, bi_context);
95 ctx->stage = MESA_SHADER_VERTEX;
96
97 bi_block *blk = rzalloc(ctx, bi_block);
98 blk->scheduled = true;
99
100 blk->base.predecessors = _mesa_set_create(blk,
101 _mesa_hash_pointer,
102 _mesa_key_pointer_equal);
103
104 list_inithead(&ctx->blocks);
105 list_addtail(&blk->base.link, &ctx->blocks);
106 list_inithead(&blk->clauses);
107
108 bi_clause *clauses[4] = {
109 rzalloc(ctx, bi_clause),
110 rzalloc(ctx, bi_clause),
111 rzalloc(ctx, bi_clause),
112 rzalloc(ctx, bi_clause)
113 };
114
115 for (unsigned i = 0; i < 4; ++i) {
116 clauses[i]->bundle_count = 1;
117 list_addtail(&clauses[i]->link, &blk->clauses);
118 clauses[i]->scoreboard_id = (i & 1);
119
120 if (i) {
121 clauses[i]->dependencies = 1 << (~i & 1);
122 clauses[i]->data_register_write_barrier = true;
123 }
124 }
125
126 clauses[0]->bundles[0].add = &ldubo;
127 clauses[0]->clause_type = BIFROST_CLAUSE_UBO;
128
129 if (fma)
130 clauses[1]->bundles[0].fma = ins;
131 else
132 clauses[1]->bundles[0].add = ins;
133
134 clauses[0]->constant_count = 1;
135 clauses[1]->constant_count = 1;
136 clauses[1]->constants[0] = ins->constant.u64;
137
138 clauses[2]->bundles[0].add = &ldva;
139 clauses[3]->bundles[0].add = &st;
140
141 clauses[2]->clause_type = BIFROST_CLAUSE_UBO;
142 clauses[3]->clause_type = BIFROST_CLAUSE_SSBO_STORE;
143
144 panfrost_program prog;
145 bi_pack(ctx, &prog.compiled);
146
147 bool succ = bit_vertex(dev, prog, input, 16, NULL, 0,
148 s.r, 16, debug);
149
150 if (debug >= BIT_DEBUG_ALL || (!succ && debug >= BIT_DEBUG_FAIL)) {
151 bi_print_shader(ctx, stderr);
152 disassemble_bifrost(stderr, prog.compiled.data, prog.compiled.size, true);
153 }
154
155 if (!succ)
156 fprintf(stderr, "FAIL\n");
157 }
158
159 /* Utilities for generating tests */
160
/* Fills mem[0..3] with pseudo-random floats drawn from rand(): each value is
 * a multiple of 1/16 in the range [-127/16, 128/16]. */
static void
bit_generate_float4(float *mem)
{
        float *const end = mem + 4;

        for (float *out = mem; out != end; ++out) {
                /* Low byte of rand(), recentred around zero */
                int centred = (rand() & 255) - 127;

                *out = (float) centred / 16.0;
        }
}
167
/* Fills mem[0..7] with pseudo-random values: the low byte of rand(),
 * recentred by 127 and divided by 16, converted with _mesa_float_to_half. */
static void
bit_generate_half8(uint16_t *mem)
{
        unsigned idx = 0;

        while (idx < 8) {
                float raw = (float) (rand() & 255);

                mem[idx] = _mesa_float_to_half((raw - 127) / 16.0);
                ++idx;
        }
}
174
175 static bi_instruction
176 bit_ins(enum bi_class C, unsigned argc, nir_alu_type base, unsigned size)
177 {
178 nir_alu_type T = base | size;
179
180 bi_instruction ins = {
181 .type = C,
182 .dest = BIR_INDEX_REGISTER | 0,
183 .dest_type = T,
184 };
185
186 for (unsigned i = 0; i < argc; ++i) {
187 ins.src[i] = BIR_INDEX_REGISTER | i;
188 ins.src_types[i] = T;
189 }
190
191 return ins;
192 }
193
/* Loop header that declares `swz` and iterates it over every swizzle selector
 * for `args` sources: 2 bits per source (4^args values) when sz is 16, and the
 * single value 0 for any other size. Arguments are parenthesised so that
 * expressions such as `n + 1` expand with the intended precedence. */
#define BIT_FOREACH_SWIZZLE(swz, args, sz) \
        for (unsigned swz = 0; swz < (((sz) == 16) ? (1u << (2 * (args))) : 1u); ++swz)
196
197 static void
198 bit_apply_swizzle(bi_instruction *ins, unsigned swz, unsigned args, unsigned sz)
199 {
200 unsigned slots_per_arg = (sz == 16) ? 4 : 1;
201 unsigned slots_per_chan = (sz == 16) ? 1 : 0;
202 unsigned mask = (sz == 16) ? 1 : 0;
203
204 for (unsigned i = 0; i < args; ++i) {
205 for (unsigned j = 0; j < (32 / sz); ++j) {
206 ins->swizzle[i][j] = ((swz >> (slots_per_arg * i)) >> (slots_per_chan * j)) & mask;
207 }
208 }
209 }
210
211 /* Tests all 64 combinations of floating point modifiers for a given
212 * instruction / floating-type / test type */
213
214 static void
215 bit_fmod_helper(struct panfrost_device *dev,
216 enum bi_class c, unsigned size, bool fma,
217 uint32_t *input, enum bit_debug debug, unsigned op)
218 {
219 bi_instruction ins = bit_ins(c, 2, nir_type_float, size);
220
221 bool fp16 = (size == 16);
222 bool has_outmods = fma || !fp16;
223
224 for (unsigned outmod = 0; outmod < (has_outmods ? 4 : 1); ++outmod) {
225 BIT_FOREACH_SWIZZLE(swz, 2, size) {
226 for (unsigned inmod = 0; inmod < 16; ++inmod) {
227 ins.outmod = outmod;
228 ins.op.minmax = op;
229 ins.src_abs[0] = (inmod & 0x1);
230 ins.src_abs[1] = (inmod & 0x2);
231 ins.src_neg[0] = (inmod & 0x4);
232 ins.src_neg[1] = (inmod & 0x8);
233 bit_apply_swizzle(&ins, swz, 2, size);
234 bit_test_single(dev, &ins, input, fma, debug);
235 }
236 }
237 }
238 }
239
240 static void
241 bit_fma_helper(struct panfrost_device *dev,
242 unsigned size, uint32_t *input, enum bit_debug debug)
243 {
244 bi_instruction ins = bit_ins(BI_FMA, 3, nir_type_float, size);
245
246 for (unsigned outmod = 0; outmod < 4; ++outmod) {
247 for (unsigned inmod = 0; inmod < 8; ++inmod) {
248 ins.outmod = outmod;
249 ins.src_neg[0] = (inmod & 0x1);
250 ins.src_neg[1] = (inmod & 0x2);
251 ins.src_neg[2] = (inmod & 0x4);
252 bit_test_single(dev, &ins, input, true, debug);
253 }
254 }
255 }
256
257 static void
258 bit_fma_mscale_helper(struct panfrost_device *dev, uint32_t *input, enum bit_debug debug)
259 {
260 bi_instruction ins = bit_ins(BI_FMA, 4, nir_type_float, 32);
261 ins.op.mscale = true;
262 ins.src_types[3] = nir_type_int32;
263 ins.src[2] = ins.src[3]; /* Not enough ports! */
264
265 for (unsigned outmod = 0; outmod < 4; ++outmod) {
266 for (unsigned inmod = 0; inmod < 8; ++inmod) {
267 ins.outmod = outmod;
268 ins.src_abs[0] = (inmod & 0x1);
269 ins.src_neg[1] = (inmod & 0x2);
270 ins.src_neg[2] = (inmod & 0x4);
271 bit_test_single(dev, &ins, input, true, debug);
272 }
273 }
274 }
275
276 static void
277 bit_csel_helper(struct panfrost_device *dev,
278 unsigned size, uint32_t *input, enum bit_debug debug)
279 {
280 bi_instruction ins = bit_ins(BI_CSEL, 4, nir_type_uint, size);
281
282 /* SCHEDULER: We can only read 3 registers at once. */
283 ins.src[2] = ins.src[0];
284
285 for (enum bi_cond cond = BI_COND_LT; cond <= BI_COND_NE; ++cond) {
286 ins.cond = cond;
287 bit_test_single(dev, &ins, input, true, debug);
288 }
289 }
290
291 static void
292 bit_special_helper(struct panfrost_device *dev,
293 unsigned size, uint32_t *input, enum bit_debug debug)
294 {
295 bi_instruction ins = bit_ins(BI_SPECIAL, 2, nir_type_float, size);
296 uint32_t exp_input[4];
297
298 for (enum bi_special_op op = BI_SPECIAL_FRCP; op <= BI_SPECIAL_EXP2_LOW; ++op) {
299 if (op == BI_SPECIAL_EXP2_LOW) {
300 /* exp2 only supported in fp32 mode */
301 if (size != 32)
302 continue;
303
304 /* Give expected input */
305 exp_input[1] = input[0];
306 float *ff = (float *) input;
307 exp_input[0] = (int) (ff[0] * (1 << 24));
308 }
309
310 for (unsigned c = 0; c < ((size == 16) ? 2 : 1); ++c) {
311 ins.op.special = op;
312 ins.swizzle[0][0] = c;
313 bit_test_single(dev, &ins,
314 op == BI_SPECIAL_EXP2_LOW ? exp_input : input,
315 false, debug);
316 }
317 }
318 }
319
320 static void
321 bit_table_helper(struct panfrost_device *dev, uint32_t *input, enum bit_debug debug)
322 {
323 bi_instruction ins = bit_ins(BI_TABLE, 1, nir_type_float, 32);
324
325 for (enum bi_table_op op = 0; op <= BI_TABLE_LOG2_U_OVER_U_1_LOW; ++op) {
326 ins.op.table = op;
327 bit_test_single(dev, &ins, input, false, debug);
328 }
329 }
330
331 static void
332 bit_frexp_helper(struct panfrost_device *dev, uint32_t *input, enum bit_debug debug)
333 {
334 bi_instruction ins = bit_ins(BI_FREXP, 1, nir_type_float, 32);
335 ins.dest_type = nir_type_int32;
336
337 for (enum bi_frexp_op op = 0; op <= BI_FREXPE_LOG; ++op) {
338 ins.op.frexp = op;
339 bit_test_single(dev, &ins, input, true, debug);
340 }
341 }
342
343 static void
344 bit_round_helper(struct panfrost_device *dev, uint32_t *input, unsigned sz, bool FMA, enum bit_debug debug)
345 {
346 bi_instruction ins = bit_ins(BI_ROUND, 1, nir_type_float, sz);
347
348 for (enum bifrost_roundmode mode = 0; mode <= 3; ++mode) {
349 BIT_FOREACH_SWIZZLE(swz, 1, sz) {
350 bit_apply_swizzle(&ins, swz, 1, sz);
351 ins.roundmode = mode;
352 bit_test_single(dev, &ins, input, FMA, debug);
353 }
354 }
355 }
356
357 static void
358 bit_reduce_helper(struct panfrost_device *dev, uint32_t *input, enum bit_debug debug)
359 {
360 bi_instruction ins = bit_ins(BI_REDUCE_FMA, 2, nir_type_float, 32);
361
362 for (enum bi_reduce_op op = 0; op <= BI_REDUCE_ADD_FREXPM; ++op) {
363 ins.op.reduce = op;
364 bit_test_single(dev, &ins, input, true, debug);
365 }
366 }
367
368 static void
369 bit_select_helper(struct panfrost_device *dev, uint32_t *input, unsigned size, enum bit_debug debug)
370 {
371 unsigned C = 32 / size;
372 bi_instruction ins = bit_ins(BI_SELECT, C, nir_type_uint, 32);
373
374 for (unsigned c = 0; c < C; ++c)
375 ins.src_types[c] = nir_type_uint | size;
376
377 if (size == 8) {
378 /* SCHEDULER: We can only read 3 registers at once. */
379 ins.src[2] = ins.src[0];
380 }
381
382 /* Each argument has swizzle {lo, hi} so 2^C options */
383 unsigned hi = (size == 16) ? 1 : 2;
384
385 for (unsigned add = 0; add < ((size == 16) ? 2 : 1); ++add) {
386 for (unsigned swizzle = 0; swizzle < (1 << C); ++swizzle) {
387 for (unsigned i = 0; i < C; ++i)
388 ins.swizzle[i][0] = ((swizzle >> i) & 1) ? hi : 0;
389
390 bit_test_single(dev, &ins, input, !add, debug);
391 }
392 }
393 }
394
395 static void
396 bit_fcmp_helper(struct panfrost_device *dev, uint32_t *input, unsigned size, enum bit_debug debug, bool FMA)
397 {
398 bi_instruction ins = bit_ins(BI_CMP, 2, nir_type_float, size);
399 ins.dest_type = nir_type_uint | size;
400
401 /* 16-bit has swizzles and abs. 32-bit has abs/neg mods. */
402 unsigned max_mods = (size == 16) ? 64 : (size == 32) ? 16 : 1;
403
404 for (enum bi_cond cond = BI_COND_LT; cond <= BI_COND_NE; ++cond) {
405 for (unsigned mods = 0; mods < max_mods; ++mods) {
406 ins.cond = cond;
407
408 if (size == 16) {
409 for (unsigned i = 0; i < 2; ++i) {
410 ins.swizzle[i][0] = ((mods >> (i * 2)) & 1) ? 1 : 0;
411 ins.swizzle[i][1] = ((mods >> (i * 2)) & 2) ? 1 : 0;
412 }
413
414 ins.src_abs[0] = (mods & 16) ? true : false;
415 ins.src_abs[1] = (mods & 32) ? true : false;
416 } else if (size == 8) {
417 for (unsigned i = 0; i < 2; ++i) {
418 for (unsigned j = 0; j < 4; ++j)
419 ins.swizzle[i][j] = j;
420 }
421 } else if (size == 32) {
422 ins.src_abs[0] = (mods & 1) ? true : false;
423 ins.src_abs[1] = (mods & 2) ? true : false;
424 ins.src_neg[0] = (mods & 4) ? true : false;
425 ins.src_neg[1] = (mods & 8) ? true : false;
426 }
427
428 bit_test_single(dev, &ins, input, FMA, debug);
429 }
430 }
431 }
432
433 static void
434 bit_icmp_helper(struct panfrost_device *dev, uint32_t *input, unsigned size, nir_alu_type T, enum bit_debug debug)
435 {
436 bi_instruction ins = bit_ins(BI_CMP, 2, T, size);
437 ins.dest_type = nir_type_uint | size;
438
439 for (enum bi_cond cond = BI_COND_LT; cond <= BI_COND_NE; ++cond) {
440 BIT_FOREACH_SWIZZLE(swz, 2, size) {
441 ins.cond = cond;
442 bit_apply_swizzle(&ins, swz, 2, size);
443 bit_test_single(dev, &ins, input, false, debug);
444 }
445 }
446 }
447
448
449
450 static void
451 bit_convert_helper(struct panfrost_device *dev, unsigned from_size,
452 unsigned to_size, unsigned cx, unsigned cy, bool FMA,
453 enum bifrost_roundmode roundmode,
454 uint32_t *input, enum bit_debug debug)
455 {
456 bi_instruction ins = {
457 .type = BI_CONVERT,
458 .dest = BIR_INDEX_REGISTER | 0,
459 .src = { BIR_INDEX_REGISTER | 0 }
460 };
461
462 nir_alu_type Ts[3] = { nir_type_float, nir_type_uint, nir_type_int };
463
464 for (unsigned from_base = 0; from_base < 3; ++from_base) {
465 for (unsigned to_base = 0; to_base < 3; ++to_base) {
466 /* Discard invalid combinations.. */
467 if ((from_size == to_size) && (from_base == to_base))
468 continue;
469
470 /* Can't switch signedness */
471 if (from_base && to_base)
472 continue;
473
474 /* No F16_TO_I32, etc */
475 if (from_size != to_size && from_base == 0 && to_base)
476 continue;
477
478 if (from_size != to_size && from_base && to_base == 0)
479 continue;
480
481 /* No need, just ignore the upper half */
482 if (from_size > to_size && from_base == to_base && from_base)
483 continue;
484
485 ins.dest_type = Ts[to_base] | to_size;
486 ins.src_types[0] = Ts[from_base] | from_size;
487 ins.roundmode = roundmode;
488 ins.swizzle[0][0] = cx;
489 ins.swizzle[0][1] = cy;
490
491 bit_test_single(dev, &ins, input, FMA, debug);
492 }
493 }
494 }
495
496 static void
497 bit_constant_helper(struct panfrost_device *dev,
498 uint32_t *input, enum bit_debug debug)
499 {
500 enum bi_class C[3] = { BI_MOV, BI_ADD, BI_FMA };
501
502 for (unsigned doubled = 0; doubled < 2; ++doubled) {
503 for (unsigned count = 1; count <= 3; ++count) {
504 bi_instruction ins = bit_ins(C[count - 1], count, nir_type_float, 32);
505
506 ins.src[0] = BIR_INDEX_CONSTANT | 0;
507 ins.src[1] = (count >= 2) ? BIR_INDEX_CONSTANT | (doubled ? 32 : 0) : 0;
508 ins.src[2] = (count >= 3) ? BIR_INDEX_ZERO : 0;
509
510 ins.constant.u64 = doubled ?
511 0x3f800000ull | (0x3f000000ull << 32ull) :
512 0x3f800000ull;
513
514 bit_test_single(dev, &ins, input, true, debug);
515 }
516 }
517 }
518
519 static void
520 bit_bitwise_helper(struct panfrost_device *dev, uint32_t *input, unsigned size, enum bit_debug debug)
521 {
522 bi_instruction ins = bit_ins(BI_BITWISE, 3, nir_type_uint, size);
523
524 /* TODO: shifts */
525 ins.src[2] = BIR_INDEX_ZERO;
526
527 /* Force identity swizzle -- bitwise is not swizzleable */
528 for (unsigned i = 0; i < 2; ++i) {
529 for (unsigned j = 0; j < (32 / size); ++j)
530 ins.swizzle[i][j] = j;
531 }
532
533 for (unsigned op = BI_BITWISE_AND; op <= BI_BITWISE_XOR; ++op) {
534 ins.op.bitwise = op;
535
536 for (unsigned mods = 0; mods < 4; ++mods) {
537 ins.bitwise.src_invert[0] = mods & 1;
538 ins.bitwise.src_invert[1] = mods & 2;
539 bit_test_single(dev, &ins, input, true, debug);
540 }
541 }
542 }
543
544 void
545 bit_packing(struct panfrost_device *dev, enum bit_debug debug)
546 {
547 float input32[4];
548 uint16_t input16[8];
549
550 bit_generate_float4(input32);
551 bit_generate_half8(input16);
552
553 bit_constant_helper(dev, (uint32_t *) input32, debug);
554
555 for (unsigned sz = 16; sz <= 32; sz *= 2) {
556 uint32_t *input =
557 (sz == 16) ? (uint32_t *) input16 :
558 (uint32_t *) input32;
559
560 bit_fmod_helper(dev, BI_ADD, sz, true, input, debug, 0);
561 bit_fmod_helper(dev, BI_ADD, sz, false, input, debug, 0);
562 bit_round_helper(dev, (uint32_t *) input32, sz, true, debug);
563
564 bit_fmod_helper(dev, BI_MINMAX, sz, false, input, debug, BI_MINMAX_MIN);
565 bit_fmod_helper(dev, BI_MINMAX, sz, false, input, debug, BI_MINMAX_MAX);
566
567 bit_fma_helper(dev, sz, input, debug);
568 bit_icmp_helper(dev, input, sz, nir_type_uint, debug);
569 bit_icmp_helper(dev, input, sz, nir_type_int, debug);
570 }
571
572 for (unsigned sz = 32; sz <= 32; sz *= 2)
573 bit_csel_helper(dev, sz, (uint32_t *) input32, debug);
574
575 float special[4] = { 0.9 };
576 uint32_t special16[4] = { _mesa_float_to_half(special[0]) | (_mesa_float_to_half(0.2) << 16) };
577
578 bit_table_helper(dev, (uint32_t *) special, debug);
579
580 for (unsigned sz = 16; sz <= 32; sz *= 2) {
581 uint32_t *input =
582 (sz == 16) ? special16 :
583 (uint32_t *) special;
584
585 bit_special_helper(dev, sz, input, debug);
586 }
587
588 for (unsigned rm = 0; rm < 4; ++rm) {
589 bit_convert_helper(dev, 32, 32, 0, 0, false, rm, (uint32_t *) input32, debug);
590
591 for (unsigned c = 0; c < 2; ++c)
592 bit_convert_helper(dev, 32, 16, c, 0, false, rm, (uint32_t *) input32, debug);
593
594 bit_convert_helper(dev, 16, 32, 0, 0, false, rm, (uint32_t *) input16, debug);
595
596 for (unsigned c = 0; c < 4; ++c)
597 bit_convert_helper(dev, 16, 16, c & 1, c >> 1, false, rm, (uint32_t *) input16, debug);
598 }
599
600 bit_frexp_helper(dev, (uint32_t *) input32, debug);
601 bit_reduce_helper(dev, (uint32_t *) input32, debug);
602
603 uint32_t mscale_input[4];
604 memcpy(mscale_input, input32, sizeof(input32));
605 mscale_input[3] = 0x7;
606 bit_fma_mscale_helper(dev, mscale_input, debug);
607
608 for (unsigned sz = 8; sz <= 16; sz *= 2) {
609 bit_select_helper(dev, (uint32_t *) input32, sz, debug);
610 }
611
612 bit_fcmp_helper(dev, (uint32_t *) input32, 32, debug, true);
613 bit_fcmp_helper(dev, (uint32_t *) input32, 16, debug, true);
614
615 for (unsigned sz = 8; sz <= 32; sz *= 2)
616 bit_bitwise_helper(dev, (uint32_t *) input32, sz, debug);
617 }