r600g: Check comp_mask before merging export instructions
[mesa.git] / src / gallium / drivers / r600 / r600_asm.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_opcodes.h"
25 #include "r600_formats.h"
26 #include "r600_shader.h"
27 #include "r600d.h"
28
29 #include <errno.h>
30 #include <byteswap.h>
31 #include "util/u_memory.h"
32 #include "pipe/p_shader_tokens.h"
33
34 #define NUM_OF_CYCLES 3
35 #define NUM_OF_COMPONENTS 4
36
/* Number of source operands the given ALU op reads, from the ISA table.
 * bc is unused but kept for call-site symmetry with the other helpers. */
static inline unsigned int r600_bytecode_get_num_operands(
		struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	return r600_isa_alu(alu->op)->src_count;
}
42
43 int r700_bytecode_alu_build(struct r600_bytecode *bc,
44 struct r600_bytecode_alu *alu, unsigned id);
45
46 static struct r600_bytecode_cf *r600_bytecode_cf(void)
47 {
48 struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf);
49
50 if (cf == NULL)
51 return NULL;
52 LIST_INITHEAD(&cf->list);
53 LIST_INITHEAD(&cf->alu);
54 LIST_INITHEAD(&cf->vtx);
55 LIST_INITHEAD(&cf->tex);
56 return cf;
57 }
58
59 static struct r600_bytecode_alu *r600_bytecode_alu(void)
60 {
61 struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu);
62
63 if (alu == NULL)
64 return NULL;
65 LIST_INITHEAD(&alu->list);
66 return alu;
67 }
68
69 static struct r600_bytecode_vtx *r600_bytecode_vtx(void)
70 {
71 struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx);
72
73 if (vtx == NULL)
74 return NULL;
75 LIST_INITHEAD(&vtx->list);
76 return vtx;
77 }
78
79 static struct r600_bytecode_tex *r600_bytecode_tex(void)
80 {
81 struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex);
82
83 if (tex == NULL)
84 return NULL;
85 LIST_INITHEAD(&tex->list);
86 return tex;
87 }
88
89 void r600_bytecode_init(struct r600_bytecode *bc,
90 enum chip_class chip_class,
91 enum radeon_family family,
92 enum r600_msaa_texture_mode msaa_texture_mode)
93 {
94 if ((chip_class == R600) &&
95 (family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) {
96 bc->ar_handling = AR_HANDLE_RV6XX;
97 bc->r6xx_nop_after_rel_dst = 1;
98 } else {
99 bc->ar_handling = AR_HANDLE_NORMAL;
100 bc->r6xx_nop_after_rel_dst = 0;
101 }
102
103 LIST_INITHEAD(&bc->cf);
104 bc->chip_class = chip_class;
105 bc->msaa_texture_mode = msaa_texture_mode;
106 }
107
108 static int r600_bytecode_add_cf(struct r600_bytecode *bc)
109 {
110 struct r600_bytecode_cf *cf = r600_bytecode_cf();
111
112 if (cf == NULL)
113 return -ENOMEM;
114 LIST_ADDTAIL(&cf->list, &bc->cf);
115 if (bc->cf_last) {
116 cf->id = bc->cf_last->id + 2;
117 if (bc->cf_last->eg_alu_extended) {
118 /* take into account extended alu size */
119 cf->id += 2;
120 bc->ndw += 2;
121 }
122 }
123 bc->cf_last = cf;
124 bc->ncf++;
125 bc->ndw += 2;
126 bc->force_add_cf = 0;
127 bc->ar_loaded = 0;
128 return 0;
129 }
130
/* Append an export/output CF instruction, merging it into the previous
 * one as a burst when the hardware allows it.
 *
 * Two exports may only be fused when everything except gpr/array_base
 * matches: type, elem_size, all four swizzles AND the component mask
 * (comp_mask) must be identical, and the combined burst may not exceed
 * 16. An EXPORT followed by an EXPORT_DONE merges into one EXPORT_DONE.
 * Returns 0 on success or a negative errno from r600_bytecode_add_cf. */
int r600_bytecode_add_output(struct r600_bytecode *bc,
		const struct r600_bytecode_output *output)
{
	int r;

	/* Track the highest GPR referenced by any output. */
	if (output->gpr >= bc->ngpr)
		bc->ngpr = output->gpr + 1;

	if (bc->cf_last && (bc->cf_last->op == output->op ||
		(bc->cf_last->op == CF_OP_EXPORT &&
		output->op == CF_OP_EXPORT_DONE)) &&
		output->type == bc->cf_last->output.type &&
		output->elem_size == bc->cf_last->output.elem_size &&
		output->swizzle_x == bc->cf_last->output.swizzle_x &&
		output->swizzle_y == bc->cf_last->output.swizzle_y &&
		output->swizzle_z == bc->cf_last->output.swizzle_z &&
		output->swizzle_w == bc->cf_last->output.swizzle_w &&
		output->comp_mask == bc->cf_last->output.comp_mask &&
		(output->burst_count + bc->cf_last->output.burst_count) <= 16) {

		/* New export immediately precedes the existing burst:
		 * extend the burst downwards (new gpr/array_base become
		 * the burst start). */
		if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
			(output->array_base + output->burst_count) == bc->cf_last->output.array_base) {

			bc->cf_last->output.end_of_program |= output->end_of_program;
			/* EXPORT + EXPORT_DONE collapses into EXPORT_DONE. */
			bc->cf_last->op = bc->cf_last->output.op = output->op;
			bc->cf_last->output.gpr = output->gpr;
			bc->cf_last->output.array_base = output->array_base;
			bc->cf_last->output.burst_count += output->burst_count;
			return 0;

		/* New export immediately follows the existing burst:
		 * extend the burst upwards. */
		} else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
			output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {

			bc->cf_last->output.end_of_program |= output->end_of_program;
			bc->cf_last->op = bc->cf_last->output.op = output->op;
			bc->cf_last->output.burst_count += output->burst_count;
			return 0;
		}
	}

	/* Could not merge: emit a new CF instruction for this export. */
	r = r600_bytecode_add_cf(bc);
	if (r)
		return r;
	bc->cf_last->op = output->op;
	memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output));
	return 0;
}
178
/* ALU instructions that may only exist once per instruction group
 * (kill and predicate-set instructions). Returns the raw flag mask,
 * not a normalized boolean -- callers only test for non-zero. */
static int is_alu_once_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	return r600_isa_alu(alu->op)->flags & (AF_KILL | AF_PRED);
}
184
/* Reduction instructions (e.g. cube/dot4): all four vector slots cooperate
 * and the result is replicated (AF_REPL) across them. */
static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	return (r600_isa_alu(alu->op)->flags & AF_REPL) &&
			(r600_isa_alu_slots(bc->isa->hw_class, alu->op) == AF_4V);
}
190
/* Instructions that write the address register (AR). */
static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	return r600_isa_alu(alu->op)->flags & AF_MOVA;
}
195
196 static int alu_uses_rel(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
197 {
198 unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
199 unsigned src;
200
201 if (alu->dst.rel) {
202 return 1;
203 }
204
205 for (src = 0; src < num_src; ++src) {
206 if (alu->src[src].rel) {
207 return 1;
208 }
209 }
210 return 0;
211 }
212
/* True when the instruction cannot run on the trans (scalar) unit,
 * i.e. it is restricted to the vector units. */
static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
	return !(slots & AF_S);
}
218
/* True when the instruction cannot run on a vector unit,
 * i.e. it is restricted to the trans (scalar) unit. */
static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
	return !(slots & AF_V);
}
224
/* alu instructions that can execute on any unit (vector or trans) */
static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
	return slots == AF_VS;
}
231
/* True for the ALU NOP instruction. */
static int is_nop_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
{
	return alu->op == ALU_OP0_NOP;
}
236
/* Distribute the ALU instructions of one group (from alu_first up to the
 * instruction with ->last set) over the hardware slots: vector units
 * x/y/z/w in assignment[0..3] and, on non-Cayman chips, the trans unit
 * in assignment[4]. Returns -1 (after asserting) when two instructions
 * collide on the same slot. */
static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
		struct r600_bytecode_alu *assignment[5])
{
	struct r600_bytecode_alu *alu;
	unsigned i, chan, trans;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	for (i = 0; i < max_slots; i++)
		assignment[i] = NULL;

	for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) {
		chan = alu->dst.chan;
		if (max_slots == 4)
			trans = 0; /* Cayman has no trans unit */
		else if (is_alu_trans_unit_inst(bc, alu))
			trans = 1;
		else if (is_alu_vec_unit_inst(bc, alu))
			trans = 0;
		else if (assignment[chan])
			trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
		else
			trans = 0;

		if (trans) {
			if (assignment[4]) {
				assert(0); /* ALU.Trans has already been allocated. */
				return -1;
			}
			assignment[4] = alu;
		} else {
			if (assignment[chan]) {
				assert(0); /* ALU.chan has already been allocated. */
				return -1;
			}
			assignment[chan] = alu;
		}

		if (alu->last)
			break;
	}
	return 0;
}
279
/* Tracks read-port usage while validating a bank-swizzle choice:
 * hw_gpr[cycle][component] holds the GPR sel read on that port
 * (-1 = free); hw_cfile_addr/elem describe the constant-file read
 * ports the same way. */
struct alu_bank_swizzle {
	int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
	int hw_cfile_addr[4];
	int hw_cfile_elem[4];
};

/* For each vector bank-swizzle encoding: which read cycle each of the
 * three sources uses. */
static const unsigned cycle_for_bank_swizzle_vec[][3] = {
	[SQ_ALU_VEC_012] = { 0, 1, 2 },
	[SQ_ALU_VEC_021] = { 0, 2, 1 },
	[SQ_ALU_VEC_120] = { 1, 2, 0 },
	[SQ_ALU_VEC_102] = { 1, 0, 2 },
	[SQ_ALU_VEC_201] = { 2, 0, 1 },
	[SQ_ALU_VEC_210] = { 2, 1, 0 }
};

/* Same mapping for the scalar (trans) bank-swizzle encodings. */
static const unsigned cycle_for_bank_swizzle_scl[][3] = {
	[SQ_ALU_SCL_210] = { 2, 1, 0 },
	[SQ_ALU_SCL_122] = { 1, 2, 2 },
	[SQ_ALU_SCL_212] = { 2, 1, 2 },
	[SQ_ALU_SCL_221] = { 2, 2, 1 }
};
301
302 static void init_bank_swizzle(struct alu_bank_swizzle *bs)
303 {
304 int i, cycle, component;
305 /* set up gpr use */
306 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
307 for (component = 0; component < NUM_OF_COMPONENTS; component++)
308 bs->hw_gpr[cycle][component] = -1;
309 for (i = 0; i < 4; i++)
310 bs->hw_cfile_addr[i] = -1;
311 for (i = 0; i < 4; i++)
312 bs->hw_cfile_elem[i] = -1;
313 }
314
315 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
316 {
317 if (bs->hw_gpr[cycle][chan] == -1)
318 bs->hw_gpr[cycle][chan] = sel;
319 else if (bs->hw_gpr[cycle][chan] != (int)sel) {
320 /* Another scalar operation has already used the GPR read port for the channel. */
321 return -1;
322 }
323 return 0;
324 }
325
326 static int reserve_cfile(struct r600_bytecode *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
327 {
328 int res, num_res = 4;
329 if (bc->chip_class >= R700) {
330 num_res = 2;
331 chan /= 2;
332 }
333 for (res = 0; res < num_res; ++res) {
334 if (bs->hw_cfile_addr[res] == -1) {
335 bs->hw_cfile_addr[res] = sel;
336 bs->hw_cfile_elem[res] = chan;
337 return 0;
338 } else if (bs->hw_cfile_addr[res] == sel &&
339 bs->hw_cfile_elem[res] == chan)
340 return 0; /* Read for this scalar element already reserved, nothing to do here. */
341 }
342 /* All cfile read ports are used, cannot reference vector element. */
343 return -1;
344 }
345
/* True for selectors that address a general-purpose register (0..127).
 * Note: sel is unsigned, so the old "sel >= 0" test was tautological
 * (triggered -Wtype-limits); only the upper bound matters. */
static int is_gpr(unsigned sel)
{
	return sel <= 127;
}
350
/* CB constants start at 512, and get translated to a kcache index when ALU
 * clauses are constructed. Note that we handle kcache constants the same way
 * as (the now gone) cfile constants, is that really required?
 *
 * Accepted ranges: 128..191 (kcache after translation) and
 * 256..4606 (cfile plus kcache before translation). */
static int is_cfile(unsigned sel)
{
	if (sel >= 128 && sel < 192)
		return 1;	/* Kcache after translation. */
	return sel >= 256 && sel < 4607;
}
360
361 static int is_const(int sel)
362 {
363 return is_cfile(sel) ||
364 (sel >= V_SQ_ALU_SRC_0 &&
365 sel <= V_SQ_ALU_SRC_LITERAL);
366 }
367
/* Validate the given vector bank swizzle for one vector-slot instruction,
 * reserving in bs the GPR and cfile read ports it needs.
 * Returns non-zero when a required port is already taken. */
static int check_vector(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
		struct alu_bank_swizzle *bs, int bank_swizzle)
{
	int r, src, num_src, sel, elem, cycle;

	num_src = r600_bytecode_get_num_operands(bc, alu);
	for (src = 0; src < num_src; src++) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_gpr(sel)) {
			cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
			if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
				/* Nothing to do; special-case optimization,
				 * second source uses first source's reservation. */
				continue;
			else {
				r = reserve_gpr(bs, sel, elem, cycle);
				if (r)
					return r;
			}
		} else if (is_cfile(sel)) {
			/* Fold the kcache bank into the key so the same sel
			 * in different banks counts as a different read. */
			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
			if (r)
				return r;
		}
		/* No restrictions on PV, PS, literal or special constants. */
	}
	return 0;
}
397
/* Validate the given scalar bank swizzle for the trans-slot instruction,
 * honoring the extra constant-read restrictions of the trans unit
 * (at most two constant reads; GPR/PV/PS loads must not use the cycles
 * consumed by the constant loads). Returns non-zero on conflict. */
static int check_scalar(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
		struct alu_bank_swizzle *bs, int bank_swizzle)
{
	int r, src, num_src, const_count, sel, elem, cycle;

	num_src = r600_bytecode_get_num_operands(bc, alu);
	for (const_count = 0, src = 0; src < num_src; ++src) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_const(sel)) { /* Any constant, including literal and inline constants. */
			if (const_count >= 2)
				/* More than two references to a constant in
				 * transcendental operation. */
				return -1;
			else
				const_count++;
		}
		if (is_cfile(sel)) {
			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
			if (r)
				return r;
		}
	}
	for (src = 0; src < num_src; ++src) {
		sel = alu->src[src].sel;
		elem = alu->src[src].chan;
		if (is_gpr(sel)) {
			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
			if (cycle < const_count)
				/* Cycle for GPR load conflicts with
				 * constant load in transcendental operation. */
				return -1;
			r = reserve_gpr(bs, sel, elem, cycle);
			if (r)
				return r;
		}
		/* PV PS restrictions */
		if (const_count && (sel == 254 || sel == 255)) {
			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
			if (cycle < const_count)
				return -1;
		}
	}
	return 0;
}
443
/* Pick a bank swizzle for each instruction of the group in slots[] such
 * that all GPR/cfile read-port constraints can be satisfied, honoring
 * any per-instruction forced swizzles. Tries combinations exhaustively
 * (odometer style). Returns 0 on success, -1 when no combination works. */
static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
		struct r600_bytecode_alu *slots[5])
{
	struct alu_bank_swizzle bs;
	int bank_swizzle[5];
	int i, r = 0, forced = 1;
	boolean scalar_only = bc->chip_class == CAYMAN ? false : true;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	for (i = 0; i < max_slots; i++) {
		if (slots[i]) {
			if (slots[i]->bank_swizzle_force) {
				slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
			} else {
				forced = 0;
			}
		}

		if (i < 4 && slots[i])
			scalar_only = false;
	}
	/* Every occupied slot has a forced swizzle: nothing to search. */
	if (forced)
		return 0;

	/* Just check every possible combination of bank swizzle.
	 * Not very efficient, but works on the first try in most of the cases. */
	for (i = 0; i < 4; i++)
		if (!slots[i] || !slots[i]->bank_swizzle_force)
			bank_swizzle[i] = SQ_ALU_VEC_012;
		else
			bank_swizzle[i] = slots[i]->bank_swizzle;

	bank_swizzle[4] = SQ_ALU_SCL_210;
	while(bank_swizzle[4] <= SQ_ALU_SCL_221) {

		init_bank_swizzle(&bs);
		if (scalar_only == false) {
			for (i = 0; i < 4; i++) {
				if (slots[i]) {
					r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
					if (r)
						break;
				}
			}
		} else
			r = 0;

		if (!r && slots[4] && max_slots == 5) {
			r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
		}
		if (!r) {
			/* Found a valid combination: commit it. */
			for (i = 0; i < max_slots; i++) {
				if (slots[i])
					slots[i]->bank_swizzle = bank_swizzle[i];
			}
			return 0;
		}

		/* Advance to the next combination, odometer style,
		 * skipping slots whose swizzle is forced. */
		if (scalar_only) {
			bank_swizzle[4]++;
		} else {
			for (i = 0; i < max_slots; i++) {
				if (!slots[i] || !slots[i]->bank_swizzle_force) {
					bank_swizzle[i]++;
					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
						break;
					else if (i < max_slots - 1)
						bank_swizzle[i] = SQ_ALU_VEC_012;
					else
						return -1;
				}
			}
		}
	}

	/* Couldn't find a working swizzle. */
	return -1;
}
522
/* Rewrite sources of the current group that read results produced by the
 * previous group (alu_prev) so they use the PV/PS bypass registers
 * instead of the GPR, relieving read-port pressure. */
static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
		struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev)
{
	struct r600_bytecode_alu *prev[5];
	int gpr[5], chan[5];
	int i, j, r, src, num_src;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	/* Collect which GPR/channel each slot of the previous group wrote
	 * (gpr[i] == -1 when the slot wrote nothing usable). */
	for (i = 0; i < max_slots; ++i) {
		if (prev[i] && (prev[i]->dst.write || prev[i]->is_op3) && !prev[i]->dst.rel) {
			gpr[i] = prev[i]->dst.sel;
			/* cube writes more than PV.X */
			if (is_alu_reduction_inst(bc, prev[i]))
				chan[i] = 0;
			else
				chan[i] = prev[i]->dst.chan;
		} else
			gpr[i] = -1;
	}

	for (i = 0; i < max_slots; ++i) {
		struct r600_bytecode_alu *alu = slots[i];
		if(!alu)
			continue;

		num_src = r600_bytecode_get_num_operands(bc, alu);
		for (src = 0; src < num_src; ++src) {
			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
				continue;

			if (bc->chip_class < CAYMAN) {
				/* PS carries the previous trans-unit result. */
				if (alu->src[src].sel == gpr[4] &&
					alu->src[src].chan == chan[4] &&
					alu_prev->pred_sel == alu->pred_sel) {
					alu->src[src].sel = V_SQ_ALU_SRC_PS;
					alu->src[src].chan = 0;
					continue;
				}
			}

			/* PV.{x,y,z,w} carry the previous vector-unit results. */
			for (j = 0; j < 4; ++j) {
				if (alu->src[src].sel == gpr[j] &&
					alu->src[src].chan == j &&
					alu_prev->pred_sel == alu->pred_sel) {
					alu->src[src].sel = V_SQ_ALU_SRC_PV;
					alu->src[src].chan = chan[j];
					break;
				}
			}
		}
	}

	return 0;
}
581
582 void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg)
583 {
584 switch(value) {
585 case 0:
586 *sel = V_SQ_ALU_SRC_0;
587 break;
588 case 1:
589 *sel = V_SQ_ALU_SRC_1_INT;
590 break;
591 case -1:
592 *sel = V_SQ_ALU_SRC_M_1_INT;
593 break;
594 case 0x3F800000: /* 1.0f */
595 *sel = V_SQ_ALU_SRC_1;
596 break;
597 case 0x3F000000: /* 0.5f */
598 *sel = V_SQ_ALU_SRC_0_5;
599 break;
600 case 0xBF800000: /* -1.0f */
601 *sel = V_SQ_ALU_SRC_1;
602 *neg ^= 1;
603 break;
604 case 0xBF000000: /* -0.5f */
605 *sel = V_SQ_ALU_SRC_0_5;
606 *neg ^= 1;
607 break;
608 default:
609 *sel = V_SQ_ALU_SRC_LITERAL;
610 break;
611 }
612 }
613
614 /* compute how many literal are needed */
615 static int r600_bytecode_alu_nliterals(struct r600_bytecode *bc, struct r600_bytecode_alu *alu,
616 uint32_t literal[4], unsigned *nliteral)
617 {
618 unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
619 unsigned i, j;
620
621 for (i = 0; i < num_src; ++i) {
622 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
623 uint32_t value = alu->src[i].value;
624 unsigned found = 0;
625 for (j = 0; j < *nliteral; ++j) {
626 if (literal[j] == value) {
627 found = 1;
628 break;
629 }
630 }
631 if (!found) {
632 if (*nliteral >= 4)
633 return -EINVAL;
634 literal[(*nliteral)++] = value;
635 }
636 }
637 }
638 return 0;
639 }
640
641 static void r600_bytecode_alu_adjust_literals(struct r600_bytecode *bc,
642 struct r600_bytecode_alu *alu,
643 uint32_t literal[4], unsigned nliteral)
644 {
645 unsigned num_src = r600_bytecode_get_num_operands(bc, alu);
646 unsigned i, j;
647
648 for (i = 0; i < num_src; ++i) {
649 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
650 uint32_t value = alu->src[i].value;
651 for (j = 0; j < nliteral; ++j) {
652 if (literal[j] == value) {
653 alu->src[i].chan = j;
654 break;
655 }
656 }
657 }
658 }
659 }
660
/* Try to merge the previous instruction group (alu_prev) with the current
 * group in slots[] so that both execute as a single group. Returns 0 both
 * for "merged" and for "cannot merge" (the caller does not distinguish);
 * on merge the slots[] array and the clause's instruction list are
 * updated in place. */
static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5],
		struct r600_bytecode_alu *alu_prev)
{
	struct r600_bytecode_alu *prev[5];
	struct r600_bytecode_alu *result[5] = { NULL };

	uint32_t literal[4], prev_literal[4];
	unsigned nliteral = 0, prev_nliteral = 0;

	int i, j, r, src, num_src;
	int num_once_inst = 0;
	int have_mova = 0, have_rel = 0;
	int max_slots = bc->chip_class == CAYMAN ? 4 : 5;

	r = assign_alu_units(bc, alu_prev, prev);
	if (r)
		return r;

	/* Never merge across predicated or once-per-group instructions. */
	for (i = 0; i < max_slots; ++i) {
		if (prev[i]) {
			if (prev[i]->pred_sel)
				return 0;
			if (is_alu_once_inst(bc, prev[i]))
				return 0;
		}
		if (slots[i]) {
			if (slots[i]->pred_sel)
				return 0;
			if (is_alu_once_inst(bc, slots[i]))
				return 0;
		}
	}

	for (i = 0; i < max_slots; ++i) {
		struct r600_bytecode_alu *alu;

		if (num_once_inst > 0)
			return 0;

		/* check number of literals */
		if (prev[i]) {
			if (r600_bytecode_alu_nliterals(bc, prev[i], literal, &nliteral))
				return 0;
			if (r600_bytecode_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
				return 0;
			/* A MOVA and a relative access may not share a group. */
			if (is_alu_mova_inst(bc, prev[i])) {
				if (have_rel)
					return 0;
				have_mova = 1;
			}

			if (alu_uses_rel(bc, prev[i])) {
				if (have_mova) {
					return 0;
				}
				have_rel = 1;
			}

			num_once_inst += is_alu_once_inst(bc, prev[i]);
		}
		if (slots[i] && r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral))
			return 0;

		/* Let's check used slots. */
		if (prev[i] && !slots[i]) {
			result[i] = prev[i];
			continue;
		} else if (prev[i] && slots[i]) {
			if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
				/* Trans unit is still free try to use it. */
				if (is_alu_any_unit_inst(bc, slots[i])) {
					result[i] = prev[i];
					result[4] = slots[i];
				} else if (is_alu_any_unit_inst(bc, prev[i])) {
					/* Don't merge when both would write the
					 * same destination register. */
					if (slots[i]->dst.sel == prev[i]->dst.sel &&
						(slots[i]->dst.write == 1 || slots[i]->is_op3) &&
						(prev[i]->dst.write == 1 || prev[i]->is_op3))
						return 0;

					result[i] = slots[i];
					result[4] = prev[i];
				} else
					return 0;
			} else
				return 0;
		} else if(!slots[i]) {
			continue;
		} else {
			/* Current instruction must not clobber the previous
			 * trans-unit result that stays in the merged group. */
			if (max_slots == 5 && slots[i] && prev[4] &&
				slots[i]->dst.sel == prev[4]->dst.sel &&
				slots[i]->dst.chan == prev[4]->dst.chan &&
				(slots[i]->dst.write == 1 || slots[i]->is_op3) &&
				(prev[4]->dst.write == 1 || prev[4]->is_op3))
				return 0;

			result[i] = slots[i];
		}

		alu = slots[i];
		num_once_inst += is_alu_once_inst(bc, alu);

		/* don't reschedule NOPs */
		if (is_nop_inst(bc, alu))
			return 0;

		if (is_alu_mova_inst(bc, alu)) {
			if (have_rel) {
				return 0;
			}
			have_mova = 1;
		}

		if (alu_uses_rel(bc, alu)) {
			if (have_mova) {
				return 0;
			}
			have_rel = 1;
		}

		/* Let's check source gprs: a current source must not read a
		 * register the previous group writes, since after merging
		 * both execute simultaneously. */
		num_src = r600_bytecode_get_num_operands(bc, alu);
		for (src = 0; src < num_src; ++src) {

			/* Constants don't matter. */
			if (!is_gpr(alu->src[src].sel))
				continue;

			for (j = 0; j < max_slots; ++j) {
				if (!prev[j] || !(prev[j]->dst.write || prev[j]->is_op3))
					continue;

				/* If it's relative then we can't determine which gpr is really used. */
				if (prev[j]->dst.chan == alu->src[src].chan &&
					(prev[j]->dst.sel == alu->src[src].sel ||
					prev[j]->dst.rel || alu->src[src].rel))
					return 0;
			}
		}
	}

	/* more than one PRED_ or KILL_ ? */
	if (num_once_inst > 1)
		return 0;

	/* check if the result can still be swizzled */
	r = check_and_set_bank_swizzle(bc, result);
	if (r)
		return 0;

	/* looks like everything worked out right, apply the changes */

	/* undo adding previous literals */
	bc->cf_last->ndw -= align(prev_nliteral, 2);

	/* sort instructions */
	for (i = 0; i < max_slots; ++i) {
		slots[i] = result[i];
		if (result[i]) {
			LIST_DEL(&result[i]->list);
			result[i]->last = 0;
			LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
		}
	}

	/* determine new last instruction */
	LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1;

	/* determine new first instruction */
	for (i = 0; i < max_slots; ++i) {
		if (result[i]) {
			bc->cf_last->curr_bs_head = result[i];
			break;
		}
	}

	bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
	bc->cf_last->prev2_bs_head = NULL;

	return 0;
}
841
/* we'll keep kcache sets sorted by bank & addr */
/* Lock the kcache line (bank, line) into one of the clause's kcache sets,
 * keeping the set array sorted. A set in LOCK_1 mode covers one line and
 * may grow to LOCK_2 (two consecutive lines). Returns 0 on success or
 * -ENOMEM when every set is in use. */
static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
		struct r600_bytecode_kcache *kcache,
		unsigned bank, unsigned line)
{
	int i, kcache_banks = bc->chip_class >= EVERGREEN ? 4 : 2;

	for (i = 0; i < kcache_banks; i++) {
		if (kcache[i].mode) {
			int d;

			if (kcache[i].bank < bank)
				continue;

			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
					kcache[i].bank > bank) {
				/* try to insert new line */
				if (kcache[kcache_banks-1].mode) {
					/* all sets are in use */
					return -ENOMEM;
				}

				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
				kcache[i].bank = bank;
				kcache[i].addr = line;
				return 0;
			}

			/* Distance between the requested line and this set. */
			d = line - kcache[i].addr;

			if (d == -1) {
				kcache[i].addr--;
				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
					/* we are prepending the line to the current set,
					 * discarding the existing second line,
					 * so we'll have to insert line+2 after it */
					line += 2;
					continue;
				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
					return 0;
				} else {
					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
					return -ENOMEM;
				}
			} else if (d == 1) {
				/* Line directly follows this set: extend it. */
				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
				return 0;
			} else if (d == 0)
				return 0; /* already locked */
		} else { /* free kcache set - use it */
			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
			kcache[i].bank = bank;
			kcache[i].addr = line;
			return 0;
		}
	}
	return -ENOMEM;
}
902
903 static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
904 struct r600_bytecode_kcache *kcache,
905 struct r600_bytecode_alu *alu)
906 {
907 int i, r;
908
909 for (i = 0; i < 3; i++) {
910 unsigned bank, line, sel = alu->src[i].sel;
911
912 if (sel < 512)
913 continue;
914
915 bank = alu->src[i].kc_bank;
916 line = (sel-512)>>4;
917
918 if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line)))
919 return r;
920 }
921 return 0;
922 }
923
/* Translate CB constant selectors (>= 512) of the instruction into
 * kcache-relative selectors, based on the kcache sets locked for the
 * clause. Returns -ENOMEM if a source maps to an unusable set. */
static int r600_bytecode_assign_kcache_banks(struct r600_bytecode *bc,
		struct r600_bytecode_alu *alu,
		struct r600_bytecode_kcache * kcache)
{
	int i, j;

	/* Alter the src operands to refer to the kcache. */
	for (i = 0; i < 3; ++i) {
		/* Base selector of each of the four kcache sets. */
		static const unsigned int base[] = {128, 160, 256, 288};
		unsigned int line, sel = alu->src[i].sel, found = 0;

		if (sel < 512)
			continue;

		sel -= 512;
		line = sel>>4;

		for (j = 0; j < 4 && !found; ++j) {
			switch (kcache[j].mode) {
			case V_SQ_CF_KCACHE_NOP:
			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
				R600_ERR("unexpected kcache line mode\n");
				return -ENOMEM;
			default:
				/* mode doubles as the set's line count
				 * (LOCK_1 = 1 line, LOCK_2 = 2 lines). */
				if (kcache[j].bank == alu->src[i].kc_bank &&
						kcache[j].addr <= line &&
						line < kcache[j].addr + kcache[j].mode) {
					alu->src[i].sel = sel - (kcache[j].addr<<4);
					alu->src[i].sel += base[j];
					found=1;
				}
			}
		}
	}
	return 0;
}
960
/* Lock the kcache lines needed by the instruction's constants into the
 * current clause, starting a fresh clause of the given CF type when the
 * current one has no free kcache set left. */
static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc,
		struct r600_bytecode_alu *alu,
		unsigned type)
{
	struct r600_bytecode_kcache kcache_sets[4];
	struct r600_bytecode_kcache *kcache = kcache_sets;
	int r;

	/* Work on a scratch copy so a failed allocation leaves the
	 * current clause's kcache state untouched. */
	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));

	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
		/* can't alloc, need to start new clause */
		if ((r = r600_bytecode_add_cf(bc))) {
			return r;
		}
		bc->cf_last->op = type;

		/* retry with the new clause */
		kcache = bc->cf_last->kcache;
		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
			/* can't alloc again- should never happen */
			return r;
		}
	} else {
		/* update kcache sets */
		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
	}

	/* if we actually used more than 2 kcache sets - use ALU_EXTENDED on eg+ */
	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP) {
		if (bc->chip_class < EVERGREEN)
			return -ENOMEM;
		bc->cf_last->eg_alu_extended = 1;
	}

	return 0;
}
998
999 static int insert_nop_r6xx(struct r600_bytecode *bc)
1000 {
1001 struct r600_bytecode_alu alu;
1002 int r, i;
1003
1004 for (i = 0; i < 4; i++) {
1005 memset(&alu, 0, sizeof(alu));
1006 alu.op = ALU_OP0_NOP;
1007 alu.src[0].chan = i;
1008 alu.dst.chan = i;
1009 alu.last = (i == 3);
1010 r = r600_bytecode_add_alu(bc, &alu);
1011 if (r)
1012 return r;
1013 }
1014 return 0;
1015 }
1016
1017 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1018 static int load_ar_r6xx(struct r600_bytecode *bc)
1019 {
1020 struct r600_bytecode_alu alu;
1021 int r;
1022
1023 if (bc->ar_loaded)
1024 return 0;
1025
1026 /* hack to avoid making MOVA the last instruction in the clause */
1027 if ((bc->cf_last->ndw>>1) >= 110)
1028 bc->force_add_cf = 1;
1029
1030 memset(&alu, 0, sizeof(alu));
1031 alu.op = ALU_OP1_MOVA_GPR_INT;
1032 alu.src[0].sel = bc->ar_reg;
1033 alu.src[0].chan = bc->ar_chan;
1034 alu.last = 1;
1035 alu.index_mode = INDEX_MODE_LOOP;
1036 r = r600_bytecode_add_alu(bc, &alu);
1037 if (r)
1038 return r;
1039
1040 /* no requirement to set uses waterfall on MOVA_GPR_INT */
1041 bc->ar_loaded = 1;
1042 return 0;
1043 }
1044
1045 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1046 static int load_ar(struct r600_bytecode *bc)
1047 {
1048 struct r600_bytecode_alu alu;
1049 int r;
1050
1051 if (bc->ar_handling)
1052 return load_ar_r6xx(bc);
1053
1054 if (bc->ar_loaded)
1055 return 0;
1056
1057 /* hack to avoid making MOVA the last instruction in the clause */
1058 if ((bc->cf_last->ndw>>1) >= 110)
1059 bc->force_add_cf = 1;
1060
1061 memset(&alu, 0, sizeof(alu));
1062 alu.op = ALU_OP1_MOVA_INT;
1063 alu.src[0].sel = bc->ar_reg;
1064 alu.src[0].chan = bc->ar_chan;
1065 alu.last = 1;
1066 r = r600_bytecode_add_alu(bc, &alu);
1067 if (r)
1068 return r;
1069
1070 bc->cf_last->r6xx_uses_waterfall = 1;
1071 bc->ar_loaded = 1;
1072 return 0;
1073 }
1074
1075 int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
1076 const struct r600_bytecode_alu *alu, unsigned type)
1077 {
1078 struct r600_bytecode_alu *nalu = r600_bytecode_alu();
1079 struct r600_bytecode_alu *lalu;
1080 int i, r;
1081
1082 if (nalu == NULL)
1083 return -ENOMEM;
1084 memcpy(nalu, alu, sizeof(struct r600_bytecode_alu));
1085
1086 if (bc->cf_last != NULL && bc->cf_last->op != type) {
1087 /* check if we could add it anyway */
1088 if (bc->cf_last->op == CF_OP_ALU &&
1089 type == CF_OP_ALU_PUSH_BEFORE) {
1090 LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1091 if (lalu->execute_mask) {
1092 bc->force_add_cf = 1;
1093 break;
1094 }
1095 }
1096 } else
1097 bc->force_add_cf = 1;
1098 }
1099
1100 /* cf can contains only alu or only vtx or only tex */
1101 if (bc->cf_last == NULL || bc->force_add_cf) {
1102 r = r600_bytecode_add_cf(bc);
1103 if (r) {
1104 free(nalu);
1105 return r;
1106 }
1107 }
1108 bc->cf_last->op = type;
1109
1110 /* Check AR usage and load it if required */
1111 for (i = 0; i < 3; i++)
1112 if (nalu->src[i].rel && !bc->ar_loaded)
1113 load_ar(bc);
1114
1115 if (nalu->dst.rel && !bc->ar_loaded)
1116 load_ar(bc);
1117
1118 /* Setup the kcache for this ALU instruction. This will start a new
1119 * ALU clause if needed. */
1120 if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) {
1121 free(nalu);
1122 return r;
1123 }
1124
1125 if (!bc->cf_last->curr_bs_head) {
1126 bc->cf_last->curr_bs_head = nalu;
1127 }
1128 /* number of gpr == the last gpr used in any alu */
1129 for (i = 0; i < 3; i++) {
1130 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1131 bc->ngpr = nalu->src[i].sel + 1;
1132 }
1133 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1134 r600_bytecode_special_constants(nalu->src[i].value,
1135 &nalu->src[i].sel, &nalu->src[i].neg);
1136 }
1137 if (nalu->dst.sel >= bc->ngpr) {
1138 bc->ngpr = nalu->dst.sel + 1;
1139 }
1140 LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
1141 /* each alu use 2 dwords */
1142 bc->cf_last->ndw += 2;
1143 bc->ndw += 2;
1144
1145 /* process cur ALU instructions for bank swizzle */
1146 if (nalu->last) {
1147 uint32_t literal[4];
1148 unsigned nliteral;
1149 struct r600_bytecode_alu *slots[5];
1150 int max_slots = bc->chip_class == CAYMAN ? 4 : 5;
1151 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1152 if (r)
1153 return r;
1154
1155 if (bc->cf_last->prev_bs_head) {
1156 r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1157 if (r)
1158 return r;
1159 }
1160
1161 if (bc->cf_last->prev_bs_head) {
1162 r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1163 if (r)
1164 return r;
1165 }
1166
1167 r = check_and_set_bank_swizzle(bc, slots);
1168 if (r)
1169 return r;
1170
1171 for (i = 0, nliteral = 0; i < max_slots; i++) {
1172 if (slots[i]) {
1173 r = r600_bytecode_alu_nliterals(bc, slots[i], literal, &nliteral);
1174 if (r)
1175 return r;
1176 }
1177 }
1178 bc->cf_last->ndw += align(nliteral, 2);
1179
1180 /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
1181 * worst case */
1182 if ((bc->cf_last->ndw >> 1) >= 120) {
1183 bc->force_add_cf = 1;
1184 }
1185
1186 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1187 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1188 bc->cf_last->curr_bs_head = NULL;
1189 }
1190
1191 if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst)
1192 insert_nop_r6xx(bc);
1193
1194 return 0;
1195 }
1196
/* Append an ALU instruction to the bytecode using the default CF_OP_ALU
 * clause type (no predicate push); thin wrapper over
 * r600_bytecode_add_alu_type(). */
int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu)
{
	return r600_bytecode_add_alu_type(bc, alu, CF_OP_ALU);
}
1201
1202 static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc)
1203 {
1204 switch (bc->chip_class) {
1205 case R600:
1206 return 8;
1207
1208 case R700:
1209 case EVERGREEN:
1210 case CAYMAN:
1211 return 16;
1212
1213 default:
1214 R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1215 return 8;
1216 }
1217 }
1218
1219 static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc)
1220 {
1221 return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) &&
1222 (bc->chip_class == CAYMAN ||
1223 bc->cf_last->op != CF_OP_TEX));
1224 }
1225
1226 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
1227 {
1228 struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
1229 int r;
1230
1231 if (nvtx == NULL)
1232 return -ENOMEM;
1233 memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx));
1234
1235 /* cf can contains only alu or only vtx or only tex */
1236 if (bc->cf_last == NULL ||
1237 last_inst_was_not_vtx_fetch(bc) ||
1238 bc->force_add_cf) {
1239 r = r600_bytecode_add_cf(bc);
1240 if (r) {
1241 free(nvtx);
1242 return r;
1243 }
1244 switch (bc->chip_class) {
1245 case R600:
1246 case R700:
1247 case EVERGREEN:
1248 bc->cf_last->op = CF_OP_VTX;
1249 break;
1250 case CAYMAN:
1251 bc->cf_last->op = CF_OP_TEX;
1252 break;
1253 default:
1254 R600_ERR("Unknown chip class %d.\n", bc->chip_class);
1255 free(nvtx);
1256 return -EINVAL;
1257 }
1258 }
1259 LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1260 /* each fetch use 4 dwords */
1261 bc->cf_last->ndw += 4;
1262 bc->ndw += 4;
1263 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1264 bc->force_add_cf = 1;
1265
1266 bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1);
1267 bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1);
1268
1269 return 0;
1270 }
1271
/* Queue one texture fetch instruction, opening a new TEX clause when
 * required. Returns 0 or a negative errno. */
int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
{
	struct r600_bytecode_tex *ntex = r600_bytecode_tex();
	int r;

	if (ntex == NULL)
		return -ENOMEM;
	memcpy(ntex, tex, sizeof(struct r600_bytecode_tex));

	/* we can't fetch data and use it as texture lookup address in the same TEX clause */
	if (bc->cf_last != NULL &&
		bc->cf_last->op == CF_OP_TEX) {
		struct r600_bytecode_tex *ttex;
		/* force a clause break if any fetch already in this clause
		 * writes the GPR this one reads its coordinates from */
		LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
			if (ttex->dst_gpr == ntex->src_gpr) {
				bc->force_add_cf = 1;
				break;
			}
		}
		/* slight hack to make gradients always go into same cf */
		if (ntex->op == FETCH_OP_SET_GRADIENTS_H)
			bc->force_add_cf = 1;
	}

	/* a CF clause can contain only ALU, only VTX, or only TEX instructions */
	if (bc->cf_last == NULL ||
		bc->cf_last->op != CF_OP_TEX ||
	        bc->force_add_cf) {
		r = r600_bytecode_add_cf(bc);
		if (r) {
			free(ntex);
			return r;
		}
		bc->cf_last->op = CF_OP_TEX;
	}
	/* track the highest GPR index touched */
	if (ntex->src_gpr >= bc->ngpr) {
		bc->ngpr = ntex->src_gpr + 1;
	}
	if (ntex->dst_gpr >= bc->ngpr) {
		bc->ngpr = ntex->dst_gpr + 1;
	}
	LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
	/* each texture fetch use 4 dwords */
	bc->cf_last->ndw += 4;
	bc->ndw += 4;
	/* close the clause once it hits the per-chip fetch limit */
	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
		bc->force_add_cf = 1;
	return 0;
}
1321
1322 int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op)
1323 {
1324 int r;
1325 r = r600_bytecode_add_cf(bc);
1326 if (r)
1327 return r;
1328
1329 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1330 bc->cf_last->op = op;
1331 return 0;
1332 }
1333
/* Cayman terminates a shader with an explicit CF_END instruction
 * rather than an end-of-program bit on the last CF word. */
int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
{
	return r600_bytecode_add_cfinst(bc, CF_OP_CF_END);
}
1338
/* common to all 3 families */
/* Encode one vertex fetch instruction into 4 bytecode dwords starting
 * at 'id'. Cayman dropped the mega-fetch mechanism, so those fields are
 * only written on older chips. Always returns 0. */
static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
{
	/* WORD0: buffer id, fetch type and source operand selection. */
	bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
			S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
			S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
			S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
	if (bc->chip_class < CAYMAN)
		bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
	id++;
	/* WORD1: destination swizzle and data format description. */
	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
				S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
				S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
				S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
				S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
				S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
				S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
				S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
				S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
				S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
	/* WORD2: byte offset into the buffer and endian swap mode. */
	bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
			S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
	if (bc->chip_class < CAYMAN)
		bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
	id++;
	/* fetch instructions are padded to 4 dwords */
	bc->bytecode[id++] = 0;
	return 0;
}
1367
/* common to all 3 families */
/* Encode one texture fetch instruction into 4 bytecode dwords starting
 * at 'id'. Always returns 0. */
static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
{
	/* WORD0: opcode, resource and source GPR. */
	bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(
					r600_isa_fetch_opcode(bc->isa->hw_class, tex->op)) |
				EG_S_SQ_TEX_WORD0_INST_MOD(tex->inst_mod) |
				S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
				S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
				S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
	/* WORD1: destination GPR/swizzle, LOD bias and coordinate types. */
	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
				S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
				S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
				S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
				S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
				S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
				S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
				S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
				S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
				S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
				S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
	/* WORD2: texel offsets, sampler and source swizzle. */
	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
				S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
				S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
				S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
				S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
				S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
				S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
				S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
	/* fetch instructions are padded to 4 dwords */
	bc->bytecode[id++] = 0;
	return 0;
}
1399
/* r600 only, r700/eg bits in r700_asm.c */
/* Encode one ALU instruction into 2 bytecode dwords starting at 'id'.
 * Always returns 0. */
static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
{
	unsigned opcode = r600_isa_alu_opcode(bc->isa->hw_class, alu->op);

	/* don't replace gpr by pv or ps for destination register */
	/* WORD0: sources 0/1 plus addressing and predication bits. */
	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
				S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
				S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) |
				S_SQ_ALU_WORD0_LAST(alu->last);

	if (alu->is_op3) {
		/* WORD1, OP3 form: carries a third source; this encoding
		 * has no abs/omod/write-mask/predicate-update fields. */
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
					S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
					S_SQ_ALU_WORD1_OP3_ALU_INST(opcode) |
					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
	} else {
		/* WORD1, OP2 form: abs/omod/write-mask plus execute-mask
		 * and predicate-update bits. */
		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
					S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
					S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
					S_SQ_ALU_WORD1_OP2_ALU_INST(opcode) |
					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
					S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) |
					S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred);
	}
	return 0;
}
1445
/* Encode an r600 fetch-clause CF instruction (the r700 variant lives in
 * r700_asm.c). COUNT is in fetch instructions (4 dwords each), biased
 * by -1 in the encoding. */
static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
{
	*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
	*bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) |
			S_SQ_CF_WORD1_BARRIER(1) |
			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
}
1453
/* common for r600/r700 - eg in eg_asm.c */
/* Encode one CF instruction (2 dwords at cf->id) according to its
 * class: ALU clause, fetch clause, export, stream-out, or plain flow
 * control. Always returns 0. */
static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
{
	unsigned id = cf->id;
	const struct cf_op_info *cfop = r600_isa_cf(cf->op);
	unsigned opcode = r600_isa_cf_opcode(bc->isa->hw_class, cf->op);

	if (cfop->flags & CF_ALU) {
		/* ALU clause: address/count plus the constant-cache (kcache)
		 * setup for banks 0 and 1. COUNT is in instruction slots
		 * (2 dwords each), biased by -1. */
		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);

		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(opcode) |
			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
			S_SQ_CF_ALU_WORD1_BARRIER(1) |
			S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? cf->r6xx_uses_waterfall : 0) |
			S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
	} else if (cfop->flags & CF_FETCH) {
		/* fetch clause header differs between r600 and r700 */
		if (bc->chip_class == R700)
			r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
		else
			r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
	} else if (cfop->flags & CF_EXP) {
		/* export (position/parameter/pixel) with per-channel swizzle */
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
	} else if (cfop->flags & CF_STRM) {
		/* stream-out / memory write: uses a component mask instead
		 * of a swizzle */
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
	} else {
		/* plain flow-control instruction (jump, loop, call, ...) */
		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
		bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(opcode) |
			S_SQ_CF_WORD1_BARRIER(1) |
			S_SQ_CF_WORD1_COND(cf->cond) |
			S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
	}
	return 0;
}
1512
/* Final assembly pass: compute the address of every CF block, allocate
 * the bytecode buffer, and encode all CF/ALU/VTX/TEX instructions into
 * dwords. Returns 0 on success or a negative errno. */
int r600_bytecode_build(struct r600_bytecode *bc)
{
	struct r600_bytecode_cf *cf;
	struct r600_bytecode_alu *alu;
	struct r600_bytecode_vtx *vtx;
	struct r600_bytecode_tex *tex;
	uint32_t literal[4];
	unsigned nliteral;
	unsigned addr;
	int i, r;

	/* size the branch/call stack from the deepest nesting recorded */
	if (bc->callstack[0].max > 0)
		bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
	if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
		bc->nstack = 1;
	}

	/* first path compute addr of each CF block */
	/* addr start after all the CF instructions */
	addr = bc->cf_last->id + 2;
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		if (r600_isa_cf(cf->op)->flags & CF_FETCH) {
			/* fetch clauses must start on a 4-dword boundary */
			addr += 3;
			addr &= 0xFFFFFFFCUL;
		}
		cf->addr = addr;
		addr += cf->ndw;
		bc->ndw = cf->addr + cf->ndw;
	}
	free(bc->bytecode);
	bc->bytecode = calloc(1, bc->ndw * 4);
	if (bc->bytecode == NULL)
		return -ENOMEM;
	/* second pass: encode each instruction at its computed address */
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		const struct cf_op_info *cfop = r600_isa_cf(cf->op);
		addr = cf->addr;
		if (bc->chip_class >= EVERGREEN)
			r = eg_bytecode_cf_build(bc, cf);
		else
			r = r600_bytecode_cf_build(bc, cf);
		if (r)
			return r;
		if (cfop->flags & CF_ALU) {
			nliteral = 0;
			memset(literal, 0, sizeof(literal));
			LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
				/* collect the literal constants used by the
				 * current instruction group */
				r = r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
				if (r)
					return r;
				r600_bytecode_alu_adjust_literals(bc, alu, literal, nliteral);
				r600_bytecode_assign_kcache_banks(bc, alu, cf->kcache);

				switch(bc->chip_class) {
				case R600:
					r = r600_bytecode_alu_build(bc, alu, addr);
					break;
				case R700:
				case EVERGREEN: /* eg alu is same encoding as r700 */
				case CAYMAN:
					r = r700_bytecode_alu_build(bc, alu, addr);
					break;
				default:
					R600_ERR("unknown chip class %d.\n", bc->chip_class);
					return -EINVAL;
				}
				if (r)
					return r;
				addr += 2;
				if (alu->last) {
					/* literals follow the group, padded to
					 * an even dword count */
					for (i = 0; i < align(nliteral, 2); ++i) {
						bc->bytecode[addr++] = literal[i];
					}
					nliteral = 0;
					memset(literal, 0, sizeof(literal));
				}
			}
		} else if (cf->op == CF_OP_VTX) {
			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
				r = r600_bytecode_vtx_build(bc, vtx, addr);
				if (r)
					return r;
				addr += 4;
			}
		} else if (cf->op == CF_OP_TEX) {
			/* on Cayman vertex fetches are emitted in TEX clauses */
			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
				assert(bc->chip_class >= EVERGREEN);
				r = r600_bytecode_vtx_build(bc, vtx, addr);
				if (r)
					return r;
				addr += 4;
			}
			LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
				r = r600_bytecode_tex_build(bc, tex, addr);
				if (r)
					return r;
				addr += 4;
			}
		}
	}
	return 0;
}
1614
1615 void r600_bytecode_clear(struct r600_bytecode *bc)
1616 {
1617 struct r600_bytecode_cf *cf = NULL, *next_cf;
1618
1619 free(bc->bytecode);
1620 bc->bytecode = NULL;
1621
1622 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1623 struct r600_bytecode_alu *alu = NULL, *next_alu;
1624 struct r600_bytecode_tex *tex = NULL, *next_tex;
1625 struct r600_bytecode_tex *vtx = NULL, *next_vtx;
1626
1627 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1628 free(alu);
1629 }
1630
1631 LIST_INITHEAD(&cf->alu);
1632
1633 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1634 free(tex);
1635 }
1636
1637 LIST_INITHEAD(&cf->tex);
1638
1639 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1640 free(vtx);
1641 }
1642
1643 LIST_INITHEAD(&cf->vtx);
1644
1645 free(cf);
1646 }
1647
1648 LIST_INITHEAD(&cf->list);
1649 }
1650
/* Print one swizzle component selector as a letter: x/y/z/w for the
 * channels, 0/1 for the inline constants, '_' for a masked component.
 * Returns the number of characters written. */
static int print_swizzle(unsigned swz)
{
	static const char chan_names[] = "xyzw01?_";

	assert(swz < 8 && swz != 6);
	return fprintf(stderr, "%c", chan_names[swz]);
}
1657
/* Print a register/constant selector, optionally decorated for relative
 * addressing ('G' for loop-index GPRs, brackets, "+AR"/"+AL" suffix).
 * Returns the number of characters written. */
static int print_sel(unsigned sel, unsigned rel, unsigned index_mode,
		unsigned need_brackets)
{
	unsigned brackets = rel || need_brackets;
	int n = 0;

	/* 'G' marks globally (loop-index) relative GPR addressing */
	if (rel && index_mode >= 5 && sel < 128)
		n += fprintf(stderr, "G");
	if (brackets)
		n += fprintf(stderr, "[");
	n += fprintf(stderr, "%d", sel);
	if (rel) {
		if (index_mode == 4)
			n += fprintf(stderr, "+AL");
		else if (index_mode == 0 || index_mode == 6)
			n += fprintf(stderr, "+AR");
	}
	if (brackets)
		n += fprintf(stderr, "]");
	return n;
}
1679
1680 static int print_dst(struct r600_bytecode_alu *alu)
1681 {
1682 int o = 0;
1683 unsigned sel = alu->dst.sel;
1684 char reg_char = 'R';
1685 if (sel > 128 - 4) { /* clause temporary gpr */
1686 sel -= 128 - 4;
1687 reg_char = 'T';
1688 }
1689
1690 if (alu->dst.write || alu->is_op3) {
1691 o += fprintf(stderr, "%c", reg_char);
1692 o += print_sel(alu->dst.sel, alu->dst.rel, alu->index_mode, 0);
1693 } else {
1694 o += fprintf(stderr, "__");
1695 }
1696 o += fprintf(stderr, ".");
1697 o += print_swizzle(alu->dst.chan);
1698 return o;
1699 }
1700
/* Print one ALU source operand: decodes the selector into a register
 * file (R/T), a kcache set (KC0-KC3), a constant buffer (C), a Param,
 * or an inline constant, with optional negate/abs decoration.
 * Returns the number of characters written.
 * Selector ranges: 0-123 GPR, 124-127 clause temp, 128-159 KC0,
 * 160-191 KC1, 256-287 KC2, 288-319 KC3, 448+ Param, 512+ CB constant,
 * the rest are inline constants (PS/PV/literal/0/1/...). */
static int print_src(struct r600_bytecode_alu *alu, unsigned idx)
{
	int o = 0;
	struct r600_bytecode_alu_src *src = &alu->src[idx];
	unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0;

	if (src->neg)
		o += fprintf(stderr,"-");
	if (src->abs)
		o += fprintf(stderr,"|");

	/* note: branch order matters, the >= tests below are only reached
	 * for sel >= 192 */
	if (sel < 128 - 4) {
		o += fprintf(stderr, "R");
	} else if (sel < 128) {
		o += fprintf(stderr, "T");
		sel -= 128 - 4;
	} else if (sel < 160) {
		o += fprintf(stderr, "KC0");
		need_brackets = 1;
		sel -= 128;
	} else if (sel < 192) {
		o += fprintf(stderr, "KC1");
		need_brackets = 1;
		sel -= 160;
	} else if (sel >= 512) {
		o += fprintf(stderr, "C%d", src->kc_bank);
		need_brackets = 1;
		sel -= 512;
	} else if (sel >= 448) {
		o += fprintf(stderr, "Param");
		sel -= 448;
		need_chan = 0;
	} else if (sel >= 288) {
		o += fprintf(stderr, "KC3");
		need_brackets = 1;
		sel -= 288;
	} else if (sel >= 256) {
		o += fprintf(stderr, "KC2");
		need_brackets = 1;
		sel -= 256;
	} else {
		/* inline constant operands: no selector index printed,
		 * only PV needs a channel suffix */
		need_sel = 0;
		need_chan = 0;
		switch (sel) {
		case V_SQ_ALU_SRC_PS:
			o += fprintf(stderr, "PS");
			break;
		case V_SQ_ALU_SRC_PV:
			o += fprintf(stderr, "PV");
			need_chan = 1;
			break;
		case V_SQ_ALU_SRC_LITERAL:
			o += fprintf(stderr, "[0x%08X %f]", src->value, *(float*)&src->value);
			break;
		case V_SQ_ALU_SRC_0_5:
			o += fprintf(stderr, "0.5");
			break;
		case V_SQ_ALU_SRC_M_1_INT:
			o += fprintf(stderr, "-1");
			break;
		case V_SQ_ALU_SRC_1_INT:
			o += fprintf(stderr, "1");
			break;
		case V_SQ_ALU_SRC_1:
			o += fprintf(stderr, "1.0");
			break;
		case V_SQ_ALU_SRC_0:
			o += fprintf(stderr, "0");
			break;
		default:
			o += fprintf(stderr, "??IMM_%d", sel);
			break;
		}
	}

	if (need_sel)
		o += print_sel(sel, src->rel, alu->index_mode, need_brackets);

	if (need_chan) {
		o += fprintf(stderr, ".");
		o += print_swizzle(src->chan);
	}

	if (src->abs)
		o += fprintf(stderr,"|");

	return o;
}
1789
/* Pad the output with spaces from column 'p' up to column 'c' (no-op
 * when already at or past it); returns the number of spaces printed. */
static int print_indent(int p, int c)
{
	int printed = 0;

	for (; p < c; p++)
		printed += fprintf(stderr, " ");
	return printed;
}
1797
/* Print a human-readable disassembly of the shader to stderr.
 * NOTE(review): indexes bc->bytecode[] throughout, so the shader must
 * already have been through r600_bytecode_build(). */
void r600_bytecode_disasm(struct r600_bytecode *bc)
{
	static int index = 0;	/* running shader counter across calls */
	struct r600_bytecode_cf *cf = NULL;
	struct r600_bytecode_alu *alu = NULL;
	struct r600_bytecode_vtx *vtx = NULL;
	struct r600_bytecode_tex *tex = NULL;

	unsigned i, id, ngr = 0, last;
	uint32_t literal[4];
	unsigned nliteral;
	char chip = '6';

	/* one-letter chip tag for the header line */
	switch (bc->chip_class) {
	case R700:
		chip = '7';
		break;
	case EVERGREEN:
		chip = 'E';
		break;
	case CAYMAN:
		chip = 'C';
		break;
	case R600:
	default:
		chip = '6';
		break;
	}
	fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n",
		bc->ndw, bc->ngpr);
	fprintf(stderr, "shader %d -- %c\n", index++, chip);

	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		/* first print the CF instruction itself ... */
		id = cf->id;
		if (cf->op == CF_NATIVE) {
			fprintf(stderr, "%04d %08X %08X CF_NATIVE\n", id, bc->bytecode[id],
					bc->bytecode[id + 1]);
		} else {
			const struct cf_op_info *cfop = r600_isa_cf(cf->op);
			if (cfop->flags & CF_ALU) {
				if (cf->eg_alu_extended) {
					/* extended ALU CF uses an extra pair of
					 * dwords for kcache banks 2/3 */
					fprintf(stderr, "%04d %08X %08X %s\n", id, bc->bytecode[id],
							bc->bytecode[id + 1], "ALU_EXT");
					id += 2;
				}
				fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id],
						bc->bytecode[id + 1], cfop->name);
				fprintf(stderr, "%d @%d ", cf->ndw / 2, cf->addr);
				/* active kcache line ranges */
				for (i = 0; i < 4; ++i) {
					if (cf->kcache[i].mode) {
						int c_start = (cf->kcache[i].addr << 4);
						int c_end = c_start + (cf->kcache[i].mode << 4);
						fprintf(stderr, "KC%d[CB%d:%d-%d] ",
							i, cf->kcache[i].bank, c_start, c_end);
					}
				}
				fprintf(stderr, "\n");
			} else if (cfop->flags & CF_FETCH) {
				fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id],
						bc->bytecode[id + 1], cfop->name);
				fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr);
				fprintf(stderr, "\n");
			} else if (cfop->flags & CF_EXP) {
				int o = 0;
				const char *exp_type[] = {"PIXEL", "POS  ", "PARAM"};
				o += fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id],
						bc->bytecode[id + 1], cfop->name);
				o += print_indent(o, 43);
				o += fprintf(stderr, "%s ", exp_type[cf->output.type]);
				if (cf->output.burst_count > 1) {
					/* burst exports cover a range of array
					 * slots / GPRs */
					o += fprintf(stderr, "%d-%d ", cf->output.array_base,
							cf->output.array_base + cf->output.burst_count - 1);

					o += print_indent(o, 55);
					o += fprintf(stderr, "R%d-%d.", cf->output.gpr,
							cf->output.gpr + cf->output.burst_count - 1);
				} else {
					o += fprintf(stderr, "%d ", cf->output.array_base);
					o += print_indent(o, 55);
					o += fprintf(stderr, "R%d.", cf->output.gpr);
				}

				o += print_swizzle(cf->output.swizzle_x);
				o += print_swizzle(cf->output.swizzle_y);
				o += print_swizzle(cf->output.swizzle_z);
				o += print_swizzle(cf->output.swizzle_w);

				print_indent(o, 67);

				fprintf(stderr, " ES:%X ", cf->output.elem_size);
				if (!cf->output.barrier)
					fprintf(stderr, "NO_BARRIER ");
				if (cf->output.end_of_program)
					fprintf(stderr, "EOP ");
				fprintf(stderr, "\n");
			} else if (r600_isa_cf(cf->op)->flags & CF_STRM) {
				/* stream-out / memory write export */
				int o = 0;
				const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK",
						"WRITE_IND_ACK"};
				o += fprintf(stderr, "%04d %08X %08X %s ", id,
						bc->bytecode[id], bc->bytecode[id + 1], cfop->name);
				o += print_indent(o, 43);
				o += fprintf(stderr, "%s ", exp_type[cf->output.type]);
				if (cf->output.burst_count > 1) {
					o += fprintf(stderr, "%d-%d ", cf->output.array_base,
							cf->output.array_base + cf->output.burst_count - 1);
					o += print_indent(o, 55);
					o += fprintf(stderr, "R%d-%d.", cf->output.gpr,
							cf->output.gpr + cf->output.burst_count - 1);
				} else {
					o += fprintf(stderr, "%d ", cf->output.array_base);
					o += print_indent(o, 55);
					o += fprintf(stderr, "R%d.", cf->output.gpr);
				}
				/* component mask printed as a swizzle: written
				 * channels by name, masked ones as '_' */
				for (i = 0; i < 4; ++i) {
					if (cf->output.comp_mask & (1 << i))
						o += print_swizzle(i);
					else
						o += print_swizzle(7);
				}

				o += print_indent(o, 67);

				fprintf(stderr, " ES:%i ", cf->output.elem_size);
				if (cf->output.array_size != 0xFFF)
					fprintf(stderr, "AS:%i ", cf->output.array_size);
				if (!cf->output.barrier)
					fprintf(stderr, "NO_BARRIER ");
				if (cf->output.end_of_program)
					fprintf(stderr, "EOP ");
				fprintf(stderr, "\n");
			} else {
				/* plain flow-control instruction */
				fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id],
						bc->bytecode[id + 1], cfop->name);
				fprintf(stderr, "@%d ", cf->cf_addr);
				if (cf->cond)
					fprintf(stderr, "CND:%X ", cf->cond);
				if (cf->pop_count)
					fprintf(stderr, "POP:%X ", cf->pop_count);
				fprintf(stderr, "\n");
			}
		}

		/* ... then the clause contents at cf->addr */
		id = cf->addr;
		nliteral = 0;
		last = 1;
		LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
			const char *omod_str[] = {"","*2","*4","/2"};
			const struct alu_op_info *aop = r600_isa_alu(alu->op);
			int o = 0;

			r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
			o += fprintf(stderr, " %04d %08X %08X ", id, bc->bytecode[id], bc->bytecode[id+1]);
			/* number each instruction group once, on its first slot */
			if (last)
				o += fprintf(stderr, "%4d ", ++ngr);
			else
				o += fprintf(stderr, " ");
			/* flags column: execute-mask, pred-update, pred-select */
			o += fprintf(stderr, "%c%c %c ", alu->execute_mask ? 'M':' ',
					alu->update_pred ? 'P':' ',
					alu->pred_sel ? alu->pred_sel==2 ? '0':'1':' ');

			o += fprintf(stderr, "%s%s%s ", aop->name,
					omod_str[alu->omod], alu->dst.clamp ? "_sat":"");

			o += print_indent(o,60);
			o += print_dst(alu);
			for (i = 0; i < aop->src_count; ++i) {
				o += fprintf(stderr, i == 0 ? ", ": ", ");
				o += print_src(alu, i);
			}

			if (alu->bank_swizzle) {
				o += print_indent(o,75);
				o += fprintf(stderr, " BS:%d", alu->bank_swizzle);
			}

			fprintf(stderr, "\n");
			id += 2;

			if (alu->last) {
				/* dump the group's literal constants,
				 * padded to an even dword count */
				for (i = 0; i < nliteral; i++, id++) {
					/* NOTE(review): type-puns the dword as
					 * float via pointer cast */
					float *f = (float*)(bc->bytecode + id);
					o = fprintf(stderr, " %04d %08X", id, bc->bytecode[id]);
					print_indent(o, 60);
					fprintf(stderr, " %f (%d)\n", *f, *(bc->bytecode + id));
				}
				id += nliteral & 1;
				nliteral = 0;
			}
			last = alu->last;
		}

		LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
			int o = 0;
			o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id],
					bc->bytecode[id + 1], bc->bytecode[id + 2]);

			o += fprintf(stderr, "%s ", r600_isa_fetch(tex->op)->name);

			o += print_indent(o, 50);

			o += fprintf(stderr, "R%d.", tex->dst_gpr);
			o += print_swizzle(tex->dst_sel_x);
			o += print_swizzle(tex->dst_sel_y);
			o += print_swizzle(tex->dst_sel_z);
			o += print_swizzle(tex->dst_sel_w);

			o += fprintf(stderr, ", R%d.", tex->src_gpr);
			o += print_swizzle(tex->src_sel_x);
			o += print_swizzle(tex->src_sel_y);
			o += print_swizzle(tex->src_sel_z);
			o += print_swizzle(tex->src_sel_w);

			o += fprintf(stderr, ", RID:%d", tex->resource_id);
			o += fprintf(stderr, ", SID:%d  ", tex->sampler_id);

			if (tex->lod_bias)
				fprintf(stderr, "LB:%d ", tex->lod_bias);

			/* coordinate types: N = normalized, U = unnormalized */
			fprintf(stderr, "CT:%c%c%c%c ",
				tex->coord_type_x ? 'N' : 'U',
				tex->coord_type_y ? 'N' : 'U',
				tex->coord_type_z ? 'N' : 'U',
				tex->coord_type_w ? 'N' : 'U');

			if (tex->offset_x)
				fprintf(stderr, "OX:%d ", tex->offset_x);
			if (tex->offset_y)
				fprintf(stderr, "OY:%d ", tex->offset_y);
			if (tex->offset_z)
				fprintf(stderr, "OZ:%d ", tex->offset_z);

			id += 4;
			fprintf(stderr, "\n");
		}

		LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
			int o = 0;
			const char * fetch_type[] = {"VERTEX", "INSTANCE", ""};
			o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id],
					bc->bytecode[id + 1], bc->bytecode[id + 2]);

			o += fprintf(stderr, "%s ", r600_isa_fetch(vtx->op)->name);

			o += print_indent(o, 50);

			o += fprintf(stderr, "R%d.", vtx->dst_gpr);
			o += print_swizzle(vtx->dst_sel_x);
			o += print_swizzle(vtx->dst_sel_y);
			o += print_swizzle(vtx->dst_sel_z);
			o += print_swizzle(vtx->dst_sel_w);

			o += fprintf(stderr, ", R%d.", vtx->src_gpr);
			o += print_swizzle(vtx->src_sel_x);

			if (vtx->offset)
				fprintf(stderr, " +%db", vtx->offset);

			o += print_indent(o, 55);

			fprintf(stderr, ", RID:%d ", vtx->buffer_id);

			fprintf(stderr, "%s ", fetch_type[vtx->fetch_type]);

			if (bc->chip_class < CAYMAN && vtx->mega_fetch_count)
				fprintf(stderr, "MFC:%d ", vtx->mega_fetch_count);

			fprintf(stderr, "UCF:%d ", vtx->use_const_fields);
			fprintf(stderr, "FMT(DTA:%d ", vtx->data_format);
			fprintf(stderr, "NUM:%d ", vtx->num_format_all);
			fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
			fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);

			id += 4;
		}
	}

	fprintf(stderr, "--------------------------------------\n");
}
2077
2078 void r600_bytecode_dump(struct r600_bytecode *bc)
2079 {
2080 struct r600_bytecode_cf *cf = NULL;
2081 struct r600_bytecode_alu *alu = NULL;
2082 struct r600_bytecode_vtx *vtx = NULL;
2083 struct r600_bytecode_tex *tex = NULL;
2084
2085 unsigned i, id;
2086 uint32_t literal[4];
2087 unsigned nliteral;
2088 char chip = '6';
2089
2090 switch (bc->chip_class) {
2091 case R700:
2092 chip = '7';
2093 break;
2094 case EVERGREEN:
2095 chip = 'E';
2096 break;
2097 case CAYMAN:
2098 chip = 'C';
2099 break;
2100 case R600:
2101 default:
2102 chip = '6';
2103 break;
2104 }
2105 fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
2106 fprintf(stderr, " %c\n", chip);
2107
2108 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2109 id = cf->id;
2110 if (cf->op == CF_NATIVE) {
2111 fprintf(stderr, "%04d %08X CF NATIVE\n", id, bc->bytecode[id]);
2112 fprintf(stderr, "%04d %08X CF NATIVE\n", id + 1, bc->bytecode[id + 1]);
2113 } else {
2114 const struct cf_op_info *cfop = r600_isa_cf(cf->op);
2115 if (cfop->flags & CF_ALU) {
2116 if (cf->eg_alu_extended) {
2117 fprintf(stderr, "%04d %08X ALU_EXT0 ", id, bc->bytecode[id]);
2118 fprintf(stderr, "KCACHE_BANK2:%X ", cf->kcache[2].bank);
2119 fprintf(stderr, "KCACHE_BANK3:%X ", cf->kcache[3].bank);
2120 fprintf(stderr, "KCACHE_MODE2:%X\n", cf->kcache[2].mode);
2121 id++;
2122 fprintf(stderr, "%04d %08X ALU_EXT1 ", id, bc->bytecode[id]);
2123 fprintf(stderr, "KCACHE_MODE3:%X ", cf->kcache[3].mode);
2124 fprintf(stderr, "KCACHE_ADDR2:%X ", cf->kcache[2].addr);
2125 fprintf(stderr, "KCACHE_ADDR3:%X\n", cf->kcache[3].addr);
2126 id++;
2127 }
2128
2129 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2130 fprintf(stderr, "ADDR:%d ", cf->addr);
2131 fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
2132 fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
2133 fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
2134 id++;
2135 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
2136 fprintf(stderr, "INST: %s ", cfop->name);
2137 fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
2138 fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
2139 fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
2140 fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
2141 } else if (cfop->flags & CF_FETCH) {
2142 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2143 fprintf(stderr, "ADDR:%d\n", cf->addr);
2144 id++;
2145 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
2146 fprintf(stderr, "INST: %s ", cfop->name);
2147 fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
2148 } else if (cfop->flags & CF_EXP) {
2149 fprintf(stderr, "%04d %08X %s ", id, bc->bytecode[id],
2150 cfop->name);
2151 fprintf(stderr, "GPR:%X ", cf->output.gpr);
2152 fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
2153 fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
2154 fprintf(stderr, "TYPE:%X\n", cf->output.type);
2155 id++;
2156 fprintf(stderr, "%04d %08X %s ", id, bc->bytecode[id],
2157 cfop->name);
2158 fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
2159 fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
2160 fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
2161 fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
2162 fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2163 fprintf(stderr, "INST: %s ", r600_isa_cf(cf->op)->name);
2164 fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2165 fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2166 } else if (cfop->flags & CF_STRM) {
2167 fprintf(stderr, "%04d %08X EXPORT %s ", id, bc->bytecode[id],
2168 cfop->name);
2169 fprintf(stderr, "GPR:%X ", cf->output.gpr);
2170 fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
2171 fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
2172 fprintf(stderr, "TYPE:%X\n", cf->output.type);
2173 id++;
2174 fprintf(stderr, "%04d %08X EXPORT %s ", id, bc->bytecode[id],
2175 cfop->name);
2176 fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
2177 fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
2178 fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
2179 fprintf(stderr, "INST: %s ", cfop->name);
2180 fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
2181 fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
2182
2183 } else {
2184 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2185 fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
2186 id++;
2187 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
2188 fprintf(stderr, "INST: %s ", cfop->name);
2189 fprintf(stderr, "COND:%X ", cf->cond);
2190 fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
2191 }
2192 }
2193
2194 id = cf->addr;
2195 nliteral = 0;
2196 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2197 r600_bytecode_alu_nliterals(bc, alu, literal, &nliteral);
2198
2199 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2200 fprintf(stderr, " SRC0(SEL:%d ", alu->src[0].sel);
2201 fprintf(stderr, "REL:%d ", alu->src[0].rel);
2202 fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
2203 fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
2204 fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
2205 fprintf(stderr, "REL:%d ", alu->src[1].rel);
2206 fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
2207 fprintf(stderr, "NEG:%d ", alu->src[1].neg);
2208 fprintf(stderr, "IM:%d) ", alu->index_mode);
2209 fprintf(stderr, "PRED_SEL:%d ", alu->pred_sel);
2210 fprintf(stderr, "LAST:%d)\n", alu->last);
2211 id++;
2212 fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
2213 fprintf(stderr, "INST: %s ", r600_isa_alu(alu->op)->name);
2214 fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
2215 fprintf(stderr, "CHAN:%d ", alu->dst.chan);
2216 fprintf(stderr, "REL:%d ", alu->dst.rel);
2217 fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
2218 fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
2219 if (alu->is_op3) {
2220 fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
2221 fprintf(stderr, "REL:%d ", alu->src[2].rel);
2222 fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
2223 fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
2224 } else {
2225 fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
2226 fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
2227 fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
2228 fprintf(stderr, "OMOD:%d ", alu->omod);
2229 fprintf(stderr, "EXECUTE_MASK:%d ", alu->execute_mask);
2230 fprintf(stderr, "UPDATE_PRED:%d\n", alu->update_pred);
2231 }
2232
2233 id++;
2234 if (alu->last) {
2235 for (i = 0; i < nliteral; i++, id++) {
2236 float *f = (float*)(bc->bytecode + id);
2237 fprintf(stderr, "%04d %08X\t%f (%d)\n", id, bc->bytecode[id], *f,
2238 *(bc->bytecode + id));
2239 }
2240 id += nliteral & 1;
2241 nliteral = 0;
2242 }
2243 }
2244
2245 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2246 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2247 fprintf(stderr, "INST: %s ", r600_isa_fetch(tex->op)->name);
2248 fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
2249 fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
2250 fprintf(stderr, "REL:%d)\n", tex->src_rel);
2251 id++;
2252 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2253 fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
2254 fprintf(stderr, "REL:%d ", tex->dst_rel);
2255 fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
2256 fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
2257 fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
2258 fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
2259 fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
2260 fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
2261 fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
2262 fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
2263 fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
2264 id++;
2265 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2266 fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
2267 fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
2268 fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
2269 fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
2270 fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
2271 fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
2272 fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
2273 fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
2274 id++;
2275 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
2276 id++;
2277 }
2278
2279 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2280 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2281 fprintf(stderr, "INST: %s ", r600_isa_fetch(vtx->op)->name);
2282 fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
2283 fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
2284 id++;
2285 /* This assumes that no semantic fetches exist */
2286 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2287 fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
2288 fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
2289 if (bc->chip_class < CAYMAN)
2290 fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
2291 else
2292 fprintf(stderr, "SEL_Y:%d) ", 0);
2293 fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
2294 fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
2295 fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
2296 fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
2297 fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
2298 fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
2299 fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
2300 fprintf(stderr, "NUM:%d ", vtx->num_format_all);
2301 fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
2302 fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
2303 id++;
2304 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2305 fprintf(stderr, "ENDIAN:%d ", vtx->endian);
2306 fprintf(stderr, "OFFSET:%d\n", vtx->offset);
2307 /* XXX */
2308 id++;
2309 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
2310 id++;
2311 }
2312 }
2313
2314 fprintf(stderr, "--------------------------------------\n");
2315 }
2316
2317 void r600_vertex_data_type(enum pipe_format pformat,
2318 unsigned *format,
2319 unsigned *num_format, unsigned *format_comp, unsigned *endian)
2320 {
2321 const struct util_format_description *desc;
2322 unsigned i;
2323
2324 *format = 0;
2325 *num_format = 0;
2326 *format_comp = 0;
2327 *endian = ENDIAN_NONE;
2328
2329 desc = util_format_description(pformat);
2330 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2331 goto out_unknown;
2332 }
2333
2334 /* Find the first non-VOID channel. */
2335 for (i = 0; i < 4; i++) {
2336 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2337 break;
2338 }
2339 }
2340
2341 *endian = r600_endian_swap(desc->channel[i].size);
2342
2343 switch (desc->channel[i].type) {
2344 /* Half-floats, floats, ints */
2345 case UTIL_FORMAT_TYPE_FLOAT:
2346 switch (desc->channel[i].size) {
2347 case 16:
2348 switch (desc->nr_channels) {
2349 case 1:
2350 *format = FMT_16_FLOAT;
2351 break;
2352 case 2:
2353 *format = FMT_16_16_FLOAT;
2354 break;
2355 case 3:
2356 case 4:
2357 *format = FMT_16_16_16_16_FLOAT;
2358 break;
2359 }
2360 break;
2361 case 32:
2362 switch (desc->nr_channels) {
2363 case 1:
2364 *format = FMT_32_FLOAT;
2365 break;
2366 case 2:
2367 *format = FMT_32_32_FLOAT;
2368 break;
2369 case 3:
2370 *format = FMT_32_32_32_FLOAT;
2371 break;
2372 case 4:
2373 *format = FMT_32_32_32_32_FLOAT;
2374 break;
2375 }
2376 break;
2377 default:
2378 goto out_unknown;
2379 }
2380 break;
2381 /* Unsigned ints */
2382 case UTIL_FORMAT_TYPE_UNSIGNED:
2383 /* Signed ints */
2384 case UTIL_FORMAT_TYPE_SIGNED:
2385 switch (desc->channel[i].size) {
2386 case 8:
2387 switch (desc->nr_channels) {
2388 case 1:
2389 *format = FMT_8;
2390 break;
2391 case 2:
2392 *format = FMT_8_8;
2393 break;
2394 case 3:
2395 case 4:
2396 *format = FMT_8_8_8_8;
2397 break;
2398 }
2399 break;
2400 case 10:
2401 if (desc->nr_channels != 4)
2402 goto out_unknown;
2403
2404 *format = FMT_2_10_10_10;
2405 break;
2406 case 16:
2407 switch (desc->nr_channels) {
2408 case 1:
2409 *format = FMT_16;
2410 break;
2411 case 2:
2412 *format = FMT_16_16;
2413 break;
2414 case 3:
2415 case 4:
2416 *format = FMT_16_16_16_16;
2417 break;
2418 }
2419 break;
2420 case 32:
2421 switch (desc->nr_channels) {
2422 case 1:
2423 *format = FMT_32;
2424 break;
2425 case 2:
2426 *format = FMT_32_32;
2427 break;
2428 case 3:
2429 *format = FMT_32_32_32;
2430 break;
2431 case 4:
2432 *format = FMT_32_32_32_32;
2433 break;
2434 }
2435 break;
2436 default:
2437 goto out_unknown;
2438 }
2439 break;
2440 default:
2441 goto out_unknown;
2442 }
2443
2444 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2445 *format_comp = 1;
2446 }
2447
2448 *num_format = 0;
2449 if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
2450 desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2451 if (!desc->channel[i].normalized) {
2452 if (desc->channel[i].pure_integer)
2453 *num_format = 1;
2454 else
2455 *num_format = 2;
2456 }
2457 }
2458 return;
2459 out_unknown:
2460 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2461 }
2462
2463 void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
2464 unsigned count,
2465 const struct pipe_vertex_element *elements)
2466 {
2467 struct r600_context *rctx = (struct r600_context *)ctx;
2468 static int dump_shaders = -1;
2469 struct r600_bytecode bc;
2470 struct r600_bytecode_vtx vtx;
2471 const struct util_format_description *desc;
2472 unsigned fetch_resource_start = rctx->chip_class >= EVERGREEN ? 0 : 160;
2473 unsigned format, num_format, format_comp, endian;
2474 uint32_t *bytecode;
2475 int i, j, r, fs_size;
2476 struct r600_fetch_shader *shader;
2477
2478 assert(count < 32);
2479
2480 memset(&bc, 0, sizeof(bc));
2481 r600_bytecode_init(&bc, rctx->chip_class, rctx->family,
2482 rctx->screen->msaa_texture_support);
2483
2484 bc.isa = rctx->isa;
2485
2486 for (i = 0; i < count; i++) {
2487 if (elements[i].instance_divisor > 1) {
2488 if (rctx->chip_class == CAYMAN) {
2489 for (j = 0; j < 4; j++) {
2490 struct r600_bytecode_alu alu;
2491 memset(&alu, 0, sizeof(alu));
2492 alu.op = ALU_OP2_MULHI_UINT;
2493 alu.src[0].sel = 0;
2494 alu.src[0].chan = 3;
2495 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2496 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2497 alu.dst.sel = i + 1;
2498 alu.dst.chan = j;
2499 alu.dst.write = j == 3;
2500 alu.last = j == 3;
2501 if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2502 r600_bytecode_clear(&bc);
2503 return NULL;
2504 }
2505 }
2506 } else {
2507 struct r600_bytecode_alu alu;
2508 memset(&alu, 0, sizeof(alu));
2509 alu.op = ALU_OP2_MULHI_UINT;
2510 alu.src[0].sel = 0;
2511 alu.src[0].chan = 3;
2512 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2513 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2514 alu.dst.sel = i + 1;
2515 alu.dst.chan = 3;
2516 alu.dst.write = 1;
2517 alu.last = 1;
2518 if ((r = r600_bytecode_add_alu(&bc, &alu))) {
2519 r600_bytecode_clear(&bc);
2520 return NULL;
2521 }
2522 }
2523 }
2524 }
2525
2526 for (i = 0; i < count; i++) {
2527 r600_vertex_data_type(elements[i].src_format,
2528 &format, &num_format, &format_comp, &endian);
2529
2530 desc = util_format_description(elements[i].src_format);
2531 if (desc == NULL) {
2532 r600_bytecode_clear(&bc);
2533 R600_ERR("unknown format %d\n", elements[i].src_format);
2534 return NULL;
2535 }
2536
2537 if (elements[i].src_offset > 65535) {
2538 r600_bytecode_clear(&bc);
2539 R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
2540 return NULL;
2541 }
2542
2543 memset(&vtx, 0, sizeof(vtx));
2544 vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
2545 vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
2546 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2547 vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2548 vtx.mega_fetch_count = 0x1F;
2549 vtx.dst_gpr = i + 1;
2550 vtx.dst_sel_x = desc->swizzle[0];
2551 vtx.dst_sel_y = desc->swizzle[1];
2552 vtx.dst_sel_z = desc->swizzle[2];
2553 vtx.dst_sel_w = desc->swizzle[3];
2554 vtx.data_format = format;
2555 vtx.num_format_all = num_format;
2556 vtx.format_comp_all = format_comp;
2557 vtx.srf_mode_all = 1;
2558 vtx.offset = elements[i].src_offset;
2559 vtx.endian = endian;
2560
2561 if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
2562 r600_bytecode_clear(&bc);
2563 return NULL;
2564 }
2565 }
2566
2567 r600_bytecode_add_cfinst(&bc, CF_OP_RET);
2568
2569 if ((r = r600_bytecode_build(&bc))) {
2570 r600_bytecode_clear(&bc);
2571 return NULL;
2572 }
2573
2574 if (dump_shaders == -1)
2575 dump_shaders = debug_get_num_option("R600_DUMP_SHADERS", 0);
2576
2577 if (dump_shaders & 1) {
2578 fprintf(stderr, "--------------------------------------------------------------\n");
2579 r600_bytecode_dump(&bc);
2580 fprintf(stderr, "______________________________________________________________\n");
2581 }
2582 if (dump_shaders & 2) {
2583 fprintf(stderr, "--------------------------------------------------------------\n");
2584 r600_bytecode_disasm(&bc);
2585 fprintf(stderr, "______________________________________________________________\n");
2586 }
2587
2588 fs_size = bc.ndw*4;
2589
2590 /* Allocate the CSO. */
2591 shader = CALLOC_STRUCT(r600_fetch_shader);
2592 if (!shader) {
2593 r600_bytecode_clear(&bc);
2594 return NULL;
2595 }
2596
2597 u_suballocator_alloc(rctx->allocator_fetch_shader, fs_size, &shader->offset,
2598 (struct pipe_resource**)&shader->buffer);
2599 if (!shader->buffer) {
2600 r600_bytecode_clear(&bc);
2601 FREE(shader);
2602 return NULL;
2603 }
2604
2605 bytecode = r600_buffer_mmap_sync_with_rings(rctx, shader->buffer, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
2606 bytecode += shader->offset / 4;
2607
2608 if (R600_BIG_ENDIAN) {
2609 for (i = 0; i < fs_size / 4; ++i) {
2610 bytecode[i] = bswap_32(bc.bytecode[i]);
2611 }
2612 } else {
2613 memcpy(bytecode, bc.bytecode, fs_size);
2614 }
2615 rctx->ws->buffer_unmap(shader->buffer->cs_buf);
2616
2617 r600_bytecode_clear(&bc);
2618 return shader;
2619 }
2620
2621 void r600_bytecode_alu_read(struct r600_bytecode *bc,
2622 struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1)
2623 {
2624 /* WORD0 */
2625 alu->src[0].sel = G_SQ_ALU_WORD0_SRC0_SEL(word0);
2626 alu->src[0].rel = G_SQ_ALU_WORD0_SRC0_REL(word0);
2627 alu->src[0].chan = G_SQ_ALU_WORD0_SRC0_CHAN(word0);
2628 alu->src[0].neg = G_SQ_ALU_WORD0_SRC0_NEG(word0);
2629 alu->src[1].sel = G_SQ_ALU_WORD0_SRC1_SEL(word0);
2630 alu->src[1].rel = G_SQ_ALU_WORD0_SRC1_REL(word0);
2631 alu->src[1].chan = G_SQ_ALU_WORD0_SRC1_CHAN(word0);
2632 alu->src[1].neg = G_SQ_ALU_WORD0_SRC1_NEG(word0);
2633 alu->index_mode = G_SQ_ALU_WORD0_INDEX_MODE(word0);
2634 alu->pred_sel = G_SQ_ALU_WORD0_PRED_SEL(word0);
2635 alu->last = G_SQ_ALU_WORD0_LAST(word0);
2636
2637 /* WORD1 */
2638 alu->bank_swizzle = G_SQ_ALU_WORD1_BANK_SWIZZLE(word1);
2639 if (alu->bank_swizzle)
2640 alu->bank_swizzle_force = alu->bank_swizzle;
2641 alu->dst.sel = G_SQ_ALU_WORD1_DST_GPR(word1);
2642 alu->dst.rel = G_SQ_ALU_WORD1_DST_REL(word1);
2643 alu->dst.chan = G_SQ_ALU_WORD1_DST_CHAN(word1);
2644 alu->dst.clamp = G_SQ_ALU_WORD1_CLAMP(word1);
2645 if (G_SQ_ALU_WORD1_ENCODING(word1)) /*ALU_DWORD1_OP3*/
2646 {
2647 alu->is_op3 = 1;
2648 alu->src[2].sel = G_SQ_ALU_WORD1_OP3_SRC2_SEL(word1);
2649 alu->src[2].rel = G_SQ_ALU_WORD1_OP3_SRC2_REL(word1);
2650 alu->src[2].chan = G_SQ_ALU_WORD1_OP3_SRC2_CHAN(word1);
2651 alu->src[2].neg = G_SQ_ALU_WORD1_OP3_SRC2_NEG(word1);
2652 alu->op = r600_isa_alu_by_opcode(bc->isa,
2653 G_SQ_ALU_WORD1_OP3_ALU_INST(word1), /* is_op3 = */ 1);
2654
2655 }
2656 else /*ALU_DWORD1_OP2*/
2657 {
2658 alu->src[0].abs = G_SQ_ALU_WORD1_OP2_SRC0_ABS(word1);
2659 alu->src[1].abs = G_SQ_ALU_WORD1_OP2_SRC1_ABS(word1);
2660 alu->op = r600_isa_alu_by_opcode(bc->isa,
2661 G_SQ_ALU_WORD1_OP2_ALU_INST(word1), /* is_op3 = */ 0);
2662 alu->omod = G_SQ_ALU_WORD1_OP2_OMOD(word1);
2663 alu->dst.write = G_SQ_ALU_WORD1_OP2_WRITE_MASK(word1);
2664 alu->update_pred = G_SQ_ALU_WORD1_OP2_UPDATE_PRED(word1);
2665 alu->execute_mask =
2666 G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(word1);
2667 }
2668 }
2669
2670 void r600_bytecode_export_read(struct r600_bytecode *bc,
2671 struct r600_bytecode_output *output, uint32_t word0, uint32_t word1)
2672 {
2673 output->array_base = G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(word0);
2674 output->type = G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(word0);
2675 output->gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(word0);
2676 output->elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(word0);
2677
2678 output->swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1);
2679 output->swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1);
2680 output->swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1);
2681 output->swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1);
2682 output->burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1);
2683 output->end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1);
2684 output->op = r600_isa_cf_by_opcode(bc->isa,
2685 G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), 0);
2686 output->barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1);
2687 output->array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1);
2688 output->comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1);
2689 }