2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
30 #define BCP_DUMP(q) do { q } while (0)
36 #include "r600_pipe.h"
37 #include "r600_shader.h"
43 #include "sb_shader.h"
50 int bc_parser::parse() {
// NOTE(review): this extraction is garbled — original source lines are split
// across rows, embedded line numbers are fused into the text, and several
// original lines (59-61, 66-67, 69-73, 76-80, 82, 86-88, 90-100) are missing.
// Comments below describe only what the visible fragments show.
// Create the bytecode decoder over the raw dword stream (dw, bc_ndw dwords).
56 dec
= new bc_decoder(ctx
, dw
, bc_ndw
);
// Map the TGSI processor type to the sb shader_target enum; the enclosing
// switch header is missing from this view — presumably it switches on the
// bytecode's processor type (TODO confirm against full source).
58 shader_target t
= TARGET_UNKNOWN
;
62 case TGSI_PROCESSOR_FRAGMENT
: t
= TARGET_PS
; break;
63 case TGSI_PROCESSOR_VERTEX
: t
= TARGET_VS
; break;
64 case TGSI_PROCESSOR_COMPUTE
: t
= TARGET_COMPUTE
; break;
65 default: assert(!"unknown shader target"); return -1; break;
// Compute shaders get special handling; the body of this branch is not
// visible here.
68 if (bc
->type
== TGSI_PROCESSOR_COMPUTE
)
// Build the IR shader object, then parse the bytecode into it.
74 sh
= new shader(ctx
, t
, bc
->debug_id
, enable_dump
);
75 int r
= parse_shader();
// Propagate stack depth from the bytecode into the IR shader.
81 sh
->nstack
= bc
->nstack
;
// Source-size statistics are collected for everything except fetch shaders.
83 if (sh
->target
!= TARGET_FETCH
) {
84 sh
->src_stats
.ndw
= bc
->ndw
;
85 sh
->collect_stats(false);
// Dump the original bytecode (presumably guarded by a dump flag that is not
// visible in this fragment — TODO confirm).
89 bc_dump(*sh
, cerr
, bc
->bytecode
, bc_ndw
).run();
101 int bc_parser::parse_shader() {
// NOTE(review): fragmentary — the do-loop header, variable declarations (r, i,
// eop, max_cf initialization) and the return are missing from this view.
// Parse CF (control-flow) instructions one by one; i is a dword index, so
// (i >> 1) is the CF instruction index. Parsing continues past an
// end-of-program marker while branch targets (max_cf) lie beyond it.
113 if ((r
= parse_cf(i
, eop
)))
116 } while (!eop
|| (i
>> 1) <= max_cf
);
121 int bc_parser::parse_decls() {
// NOTE(review): fragmentary extraction — several original lines are missing
// (e.g. 124, 126-128, 130-132, 140-150, 166-174, 176-178); comments describe
// only the visible fragments.
123 // sh->prepare_regs(rs.bc.ngpr);
// Indirect addressing of any file other than constants forces treating the
// whole GPR range as one array (conservative: full 0x0F component mask).
125 if (pshader
->indirect_files
& ~(1 << TGSI_FILE_CONSTANT
)) {
129 sh
->add_gpr_array(0, pshader
->bc
.ngpr
, 0x0F);
133 assert(pshader
->num_arrays
);
// Register the shader's declared GPR arrays individually when present...
135 if (pshader
->num_arrays
) {
137 for (unsigned i
= 0; i
< pshader
->num_arrays
; ++i
) {
138 r600_shader_array
&a
= pshader
->arrays
[i
];
139 sh
->add_gpr_array(a
.gpr_start
, a
.gpr_count
, a
.comp_mask
);
// ...otherwise fall back to one array covering all GPRs.
143 sh
->add_gpr_array(0, pshader
->bc
.ngpr
, 0x0F);
// Vertex shaders: GPR0 is preloaded with vertex data (treated as an input).
151 if (sh
->target
== TARGET_VS
)
152 sh
->add_input(0, 1, 0x0F);
// On Evergreen+ pixel shaders, inputs are loaded via interpolation
// instructions rather than preloaded into GPRs.
154 bool ps_interp
= ctx
.hw_class
>= HW_CLASS_EVERGREEN
155 && sh
->target
== TARGET_PS
;
157 unsigned linear
= 0, persp
= 0, centroid
= 1;
// Walk the declared inputs; PS inputs without interpolation are preloaded.
159 for (unsigned i
= 0; i
< pshader
->ninput
; ++i
) {
160 r600_shader_io
& in
= pshader
->input
[i
];
161 bool preloaded
= sh
->target
== TARGET_PS
&& !(ps_interp
&& in
.spi_sid
);
162 sh
->add_input(in
.gpr
, preloaded
, /*in.write_mask*/ 0x0F);
// Track which interpolation modes are used (bodies of these branches are
// missing from this view — presumably they set the linear/persp counters;
// TODO confirm).
163 if (ps_interp
&& in
.spi_sid
) {
164 if (in
.interpolate
== TGSI_INTERPOLATE_LINEAR
||
165 in
.interpolate
== TGSI_INTERPOLATE_COLOR
)
167 else if (in
.interpolate
== TGSI_INTERPOLATE_PERSPECTIVE
)
// Build the component mask for the interpolation parameter GPRs from the
// number of interpolation modes in use (2 components per ij pair).
175 unsigned mask
= (1 << (2 * (linear
+ persp
) * centroid
)) - 1;
179 sh
->add_input(gpr
, true, mask
& 0x0F);
190 int bc_parser::parse_cf(unsigned &i
, bool &eop
) {
// NOTE(review): fragmentary — declarations of r and the final return are
// among the missing lines. Decodes one CF instruction at dword index i,
// appends it to the IR, and dispatches clause parsing by op class.
194 cf_node
*cf
= sh
->create_cf();
195 sh
->root
->push_back(cf
);
// CF instructions are 2 dwords: id is the CF-instruction index.
197 unsigned id
= i
>> 1;
// Record the node in cf_map so branch targets can be resolved later
// (see prepare_ir / prepare_loop / prepare_if).
201 if (cf_map
.size() < id
+ 1)
202 cf_map
.resize(id
+ 1);
206 if ((r
= dec
->decode_cf(i
, cf
->bc
)))
209 cf_op_flags flags
= (cf_op_flags
)cf
->bc
.op_ptr
->flags
;
// Dispatch on the op class: ALU and fetch ops own sub-clauses that must be
// parsed; exports and stream/RAT ops only get sanity checks here.
211 if (flags
& CF_ALU
) {
212 if ((r
= parse_alu_clause(cf
)))
214 } else if (flags
& CF_FETCH
) {
215 if ((r
= parse_fetch_clause(cf
)))
217 } else if (flags
& CF_EXP
) {
218 assert(!cf
->bc
.rw_rel
);
219 } else if (flags
& (CF_STRM
| CF_RAT
)) {
220 assert(!cf
->bc
.rw_rel
);
221 } else if (cf
->bc
.op
== CF_OP_CALL_FS
) {
// Fetch-shader call must stay first and in place.
222 sh
->init_call_fs(cf
);
223 cf
->flags
|= NF_SCHEDULE_EARLY
| NF_DONT_MOVE
;
224 } else if (flags
& CF_BRANCH
) {
// Track the furthest branch target so parse_shader keeps decoding past an
// early end-of-program marker.
225 if (cf
->bc
.addr
> max_cf
)
226 max_cf
= cf
->bc
.addr
;
// End of program is signalled either by the EOP bit or by explicit
// CF_END / RET ops.
229 eop
= cf
->bc
.end_of_program
|| cf
->bc
.op
== CF_OP_CF_END
||
230 cf
->bc
.op
== CF_OP_RET
;
234 int bc_parser::parse_alu_clause(cf_node
* cf
) {
// NOTE(review): fragmentary — the loop structure around parse_alu_group and
// the return are missing from this view.
// i: dword index of the clause body; cnt: clause length in dwords
// (count field is biased by 1); gcnt: dwords consumed per ALU group.
235 unsigned i
= cf
->bc
.addr
<< 1, cnt
= cf
->bc
.count
+ 1, gcnt
;
237 cf
->subtype
= NST_ALU_CLAUSE
;
// Reset the previous-group slot table (5 slots: x,y,z,w,trans) so PV/PS
// references in the first group see no predecessors.
240 memset(slots
[0], 0, 5*sizeof(slots
[0][0]));
245 parse_alu_group(cf
, i
, gcnt
);
254 int bc_parser::parse_alu_group(cf_node
* cf
, unsigned &i
, unsigned &gcnt
) {
// NOTE(review): fragmentary extraction with many missing lines (loop
// headers, switch headers for the special-sel handling, returns). Comments
// describe only what the visible fragments show.
// Decode one ALU instruction group starting at dword index i; i and gcnt
// are advanced past the group (including its literal constants).
257 alu_group_node
*g
= sh
->create_alu_group();
// Clear the slot table for the current group (cgroup toggles between 0/1 so
// the other half still describes the previous group for PV/PS resolution).
260 memset(slots
[cgroup
], 0, 5*sizeof(slots
[0][0]));
// Decode instructions until the 'last' bit terminates the group (at most 5
// slots per group).
265 n
= sh
->create_alu();
268 if ((r
= dec
->decode_alu(i
, n
->bc
)))
271 if (!sh
->assign_slot(n
, slots
[cgroup
])) {
272 assert(!"alu slot assignment failed");
278 } while (gcnt
<= 5 && !n
->bc
.last
);
// Second pass over the group: wire up IR values for dsts and srcs.
282 unsigned literal_mask
= 0;
284 for (node_iterator I
= g
->begin(), E
= g
->end();
286 n
= static_cast<alu_node
*>(*I
);
287 unsigned src_count
= n
->bc
.op_ptr
->src_count
;
// Mark ops that occupy all four vector slots.
289 if (ctx
.alu_slots(n
->bc
.op
) & AF_4SLOT
)
290 n
->flags
|= NF_ALU_4SLOT
;
292 n
->src
.resize(src_count
);
294 unsigned flags
= n
->bc
.op_ptr
->flags
;
// Predicate-setting ops define the special ALU_PRED / EXEC_MASK values and
// must not be hoisted out of their position.
296 if (flags
& AF_PRED
) {
298 if (n
->bc
.update_pred
)
299 n
->dst
[1] = sh
->get_special_value(SV_ALU_PRED
);
300 if (n
->bc
.update_exec_mask
)
301 n
->dst
[2] = sh
->get_special_value(SV_EXEC_MASK
);
303 n
->flags
|= NF_DONT_HOIST
;
// KILL ops update the valid mask and are pinned: never moved, killed, or
// hoisted, and scheduled early.
305 } else if (flags
& AF_KILL
) {
308 n
->dst
[1] = sh
->get_special_value(SV_VALID_MASK
);
311 n
->flags
|= NF_DONT_HOIST
| NF_DONT_MOVE
|
312 NF_DONT_KILL
| NF_SCHEDULE_EARLY
;
// MOVA writes the address register (AR) special value.
318 if (flags
& AF_MOVA
) {
320 n
->dst
[0] = sh
->get_special_value(SV_AR_INDEX
);
322 n
->flags
|= NF_DONT_HOIST
;
// Ops with 3 sources always write; otherwise write_mask decides whether a
// GPR dst value is created.
324 } else if (n
->bc
.op_ptr
->src_count
== 3 || n
->bc
.write_mask
) {
325 assert(!n
->bc
.dst_rel
|| n
->bc
.index_mode
== INDEX_AR_X
);
327 value
*v
= sh
->get_gpr_value(false, n
->bc
.dst_gpr
, n
->bc
.dst_chan
,
// Predicated instructions depend on the current ALU_PRED value.
333 if (n
->bc
.pred_sel
) {
334 sh
->has_alu_predication
= true;
335 n
->pred
= sh
->get_special_value(SV_ALU_PRED
);
// Resolve each source operand to an IR value by its sel field.
338 for (unsigned s
= 0; s
< src_count
; ++s
) {
339 bc_alu_src
&src
= n
->bc
.src
[s
];
// Literal: the constant dwords follow the group; chan selects which one.
341 if (src
.sel
== ALU_SRC_LITERAL
) {
342 unsigned chan
= src
.chan
;
344 literal_mask
|= (1 << chan
);
345 src
.value
.u
= dw
[i
+chan
];
346 n
->src
[s
] = sh
->get_const_value(src
.value
);
// PV/PS: result of the corresponding slot in the PREVIOUS group
// (PS = trans slot, PV = the matching vector channel slot).
347 } else if (src
.sel
== ALU_SRC_PS
|| src
.sel
== ALU_SRC_PV
) {
348 unsigned pgroup
= !cgroup
, prev_slot
= src
.sel
== ALU_SRC_PS
?
349 SLOT_TRANS
: src
.chan
;
350 alu_node
*prev_alu
= slots
[pgroup
][prev_slot
];
// If the producing instruction had no dst value (no write_mask), give it a
// temp so the PV/PS consumer has something to read.
354 if (!prev_alu
->dst
[0]) {
355 value
* t
= sh
->create_temp_value();
356 prev_alu
->dst
[0] = t
;
359 value
*d
= prev_alu
->dst
[0];
362 d
= sh
->get_gpr_value(true, prev_alu
->bc
.dst_gpr
,
363 prev_alu
->bc
.dst_chan
,
364 prev_alu
->bc
.dst_rel
);
// Constant-cache (kcache) source: decode bank set + address from sel.
368 } else if (ctx
.is_kcache_sel(src
.sel
)) {
369 unsigned sel
= src
.sel
, kc_addr
;
370 unsigned kc_set
= ((sel
>> 7) & 2) + ((sel
>> 5) & 1);
372 bc_kcache
&kc
= cf
->bc
.kc
[kc_set
];
373 kc_addr
= (kc
.addr
<< 4) + (sel
& 0x1F);
374 n
->src
[s
] = sh
->get_kcache_value(kc
.bank
, kc_addr
, src
.chan
);
// Plain GPR source.
375 } else if (src
.sel
< MAX_GPR
) {
376 value
*v
= sh
->get_gpr_value(true, src
.sel
, src
.chan
, src
.rel
);
380 } else if (src
.sel
>= ALU_SRC_PARAM_OFFSET
) {
381 // using slot for value channel because in fact the slot
382 // determines the channel that is loaded by INTERP_LOAD_P0
383 // (and maybe some others).
384 // otherwise GVN will consider INTERP_LOAD_P0s with the same
385 // param index as equal instructions and leave only one of them
386 n
->src
[s
] = sh
->get_special_ro_value(sel_chan(src
.sel
,
// Remaining special sels map to inline constants (0, 0.5, 1.0, 1, -1) or a
// generic special read-only value; the enclosing switch header and case
// labels are mostly missing from this view.
391 n
->src
[s
] = sh
->get_const_value(0);
394 n
->src
[s
] = sh
->get_const_value(0.5f
);
397 n
->src
[s
] = sh
->get_const_value(1.0f
);
400 n
->src
[s
] = sh
->get_const_value(1);
402 case ALU_SRC_M_1_INT
:
403 n
->src
[s
] = sh
->get_const_value(-1);
406 n
->src
[s
] = sh
->get_special_ro_value(src
.sel
);
413 // pack multislot instructions into alu_packed_node
415 alu_packed_node
*p
= NULL
;
416 for (node_iterator N
, I
= g
->begin(), E
= g
->end(); I
!= E
; I
= N
) {
418 alu_node
*a
= static_cast<alu_node
*>(*I
);
419 unsigned sflags
= a
->bc
.slot_flags
;
// 4-slot vector ops (and Cayman scalar ops) get wrapped in a packed node.
421 if (sflags
== AF_4V
|| (ctx
.is_cayman() && sflags
== AF_S
)) {
423 p
= sh
->create_alu_packed();
// Collect the literal dwords referenced by this group; literal storage is
// padded to an even number of dwords.
434 unsigned literal_ndw
= 0;
435 while (literal_mask
) {
436 g
->literals
.push_back(dw
[i
+ literal_ndw
]);
441 literal_ndw
= (literal_ndw
+ 1) & ~1u;
444 gcnt
+= literal_ndw
>> 1;
450 int bc_parser::parse_fetch_clause(cf_node
* cf
) {
// NOTE(review): fragmentary — loop headers, the grad_v/grad_h declarations,
// src resizing and the return are among the missing lines.
// i: dword index of the clause body; cnt: clause length in dwords.
452 unsigned i
= cf
->bc
.addr
<< 1, cnt
= cf
->bc
.count
+ 1;
454 cf
->subtype
= NST_TEX_CLAUSE
;
// Decode one fetch (texture/vertex) instruction.
459 fetch_node
*n
= sh
->create_fetch();
461 if ((r
= dec
->decode_fetch(i
, n
->bc
)))
464 unsigned flags
= n
->bc
.op_ptr
->flags
;
// Vertex fetches have a target-specific source count; texture fetches use 4.
466 unsigned vtx
= flags
& FF_VTX
;
467 unsigned num_src
= vtx
? ctx
.vtx_src_num
: 4;
471 if (flags
& (FF_SETGRAD
| FF_USEGRAD
| FF_GETGRAD
)) {
472 sh
->uses_gradients
= true;
// SET_GRADIENTS_H/V select which gradient vector (grad_h or grad_v) the
// following per-component loop fills; the selection code between the case
// labels is missing from this view.
475 if (flags
& FF_SETGRAD
) {
480 case FETCH_OP_SET_GRADIENTS_V
:
483 case FETCH_OP_SET_GRADIENTS_H
:
487 assert(!"unexpected SET_GRAD instruction");
// Capture gradient components: a GPR channel, or the inline constants
// 0.0 / 1.0 selected by SEL_0 / SEL_1.
494 for(unsigned s
= 0; s
< 4; ++s
) {
495 unsigned sw
= n
->bc
.src_sel
[s
];
497 (*grad
)[s
] = sh
->get_gpr_value(true, n
->bc
.src_gpr
,
499 else if (sw
== SEL_0
)
500 (*grad
)[s
] = sh
->get_const_value(0.0f
);
501 else if (sw
== SEL_1
)
502 (*grad
)[s
] = sh
->get_const_value(1.0f
);
// Gradient-using fetches carry the captured gradients as extra sources
// (slots 4..7 = vertical, 8..11 = horizontal).
506 if (flags
& FF_USEGRAD
) {
508 std::copy(grad_v
.begin(), grad_v
.end(), n
->src
.begin() + 4);
509 std::copy(grad_h
.begin(), grad_h
.end(), n
->src
.begin() + 8);
// Create dst values for all non-masked result components.
514 for(int s
= 0; s
< 4; ++s
) {
515 if (n
->bc
.dst_sel
[s
] != SEL_MASK
)
516 n
->dst
[s
] = sh
->get_gpr_value(false, n
->bc
.dst_gpr
, s
, false);
517 // NOTE: it doesn't matter here which components of the result we
518 // are using, but original n->bc.dst_sel should be taken into
519 // account when building the bytecode
// Wire up the source coordinate components (plain channel selects only).
521 for(unsigned s
= 0; s
< num_src
; ++s
) {
522 if (n
->bc
.src_sel
[s
] <= SEL_W
)
523 n
->src
[s
] = sh
->get_gpr_value(true, n
->bc
.src_gpr
,
524 n
->bc
.src_sel
[s
], false);
532 int bc_parser::prepare_ir() {
// NOTE(review): fragmentary — branch bodies for LOOP_START / JUMP / LOOP_END
// (presumably calls to prepare_loop / prepare_if and loop_stack maintenance —
// TODO confirm), several loop bodies and the return are missing.
// Second pass over the decoded CF instructions: turn flat control flow into
// structured IR (region/repeat/depart/if nodes) and unroll burst exports.
534 for(id_cf_map::iterator I
= cf_map
.begin(), E
= cf_map
.end(); I
!= E
; ++I
) {
540 unsigned flags
= c
->bc
.op_ptr
->flags
;
542 if (flags
& CF_LOOP_START
) {
544 } else if (c
->bc
.op
== CF_OP_JUMP
) {
546 } else if (c
->bc
.op
== CF_OP_LOOP_END
) {
// LOOP_CONTINUE: everything from the start of the enclosing block up to this
// node is moved into a repeat targeting the innermost loop region.
548 } else if (c
->bc
.op
== CF_OP_LOOP_CONTINUE
) {
549 assert(!loop_stack
.empty());
550 repeat_node
*rep
= sh
->create_repeat(loop_stack
.top());
551 if (c
->parent
->first
!= c
)
552 rep
->move(c
->parent
->first
, c
);
553 c
->replace_with(rep
);
554 sh
->simplify_dep_rep(rep
);
// LOOP_BREAK: same shape as continue, but with a depart node.
555 } else if (c
->bc
.op
== CF_OP_LOOP_BREAK
) {
556 assert(!loop_stack
.empty());
557 depart_node
*dep
= sh
->create_depart(loop_stack
.top());
558 if (c
->parent
->first
!= c
)
559 dep
->move(c
->parent
->first
, c
);
560 c
->replace_with(dep
);
561 sh
->simplify_dep_rep(dep
);
562 } else if (flags
& CF_ALU
&& ctx
.is_cayman()) {
563 // postprocess cayman's 3-slot instructions (ex-trans-only)
564 // FIXME it shouldn't be required with proper handling
565 prepare_alu_clause(c
);
566 } else if (flags
& CF_EXP
) {
568 // unroll burst exports
570 assert(c
->bc
.op
== CF_OP_EXPORT
|| c
->bc
.op
== CF_OP_EXPORT_DONE
);
// Normalize to plain EXPORT; end_of_program/burst_count are saved and
// cleared while the burst is expanded into individual exports.
572 c
->bc
.set_op(CF_OP_EXPORT
);
574 unsigned burst_count
= c
->bc
.burst_count
;
575 unsigned eop
= c
->bc
.end_of_program
;
577 c
->bc
.end_of_program
= 0;
578 c
->bc
.burst_count
= 0;
// Bind export sources per component: inline 0/1 constants or GPR channels.
583 for(int s
= 0; s
< 4; ++s
) {
584 switch (c
->bc
.sel
[s
]) {
586 c
->src
[s
] = sh
->get_const_value(0.0f
);
589 c
->src
[s
] = sh
->get_const_value(1.0f
);
594 if (c
->bc
.sel
[s
] <= SEL_W
)
595 c
->src
[s
] = sh
->get_gpr_value(true, c
->bc
.rw_gpr
,
596 c
->bc
.sel
[s
], false);
598 assert(!"invalid src_sel for export");
// Clone the CF node for the next burst element, advancing GPR and target.
605 cf_node
*cf_next
= sh
->create_cf();
607 ++cf_next
->bc
.rw_gpr
;
608 ++cf_next
->bc
.array_base
;
610 c
->insert_after(cf_next
);
// Restore end_of_program on the last export of the burst.
615 c
->bc
.end_of_program
= eop
;
// Stream-out / RAT ops: analogous burst unrolling.
616 } else if (flags
& (CF_STRM
| CF_RAT
)) {
618 unsigned burst_count
= c
->bc
.burst_count
;
619 unsigned eop
= c
->bc
.end_of_program
;
621 c
->bc
.end_of_program
= 0;
622 c
->bc
.burst_count
= 0;
// Bind sources for components selected by comp_mask.
628 for(int s
= 0; s
< 4; ++s
) {
629 if (c
->bc
.comp_mask
& (1 << s
))
631 sh
->get_gpr_value(true, c
->bc
.rw_gpr
, s
, false);
// Indexed RAT writes additionally read a 3-component index GPR.
634 if ((flags
& CF_RAT
) && (c
->bc
.type
& 1)) { // indexed write
636 for(int s
= 0; s
< 3; ++s
) {
638 sh
->get_gpr_value(true, c
->bc
.index_gpr
, s
, false);
641 // FIXME probably we can relax it a bit
642 c
->flags
|= NF_DONT_HOIST
| NF_DONT_MOVE
;
648 cf_node
*cf_next
= sh
->create_cf();
650 ++cf_next
->bc
.rw_gpr
;
652 // FIXME is it correct?
653 cf_next
->bc
.array_base
+= cf_next
->bc
.elem_size
+ 1;
655 c
->insert_after(cf_next
);
659 c
->bc
.end_of_program
= eop
;
// All loops must have been closed by the time CF processing finishes.
664 assert(loop_stack
.empty());
668 int bc_parser::prepare_loop(cf_node
* c
) {
// Structure one loop: c is the LOOP_START CF node; its addr points just past
// the matching LOOP_END, which was recorded in cf_map during parse_cf.
// (NOTE(review): the return statement is missing from this fragment.)
670 cf_node
*end
= cf_map
[c
->bc
.addr
- 1];
671 assert(end
->bc
.op
== CF_OP_LOOP_END
);
672 assert(c
->parent
== end
->parent
);
// Wrap the loop body [c, end] in a region containing a repeat node.
674 region_node
*reg
= sh
->create_region();
675 repeat_node
*rep
= sh
->create_repeat(reg
);
678 c
->insert_before(reg
);
679 rep
->move(c
, end
->next
);
// Track the innermost loop region for LOOP_BREAK / LOOP_CONTINUE handling.
681 loop_stack
.push(reg
);
685 int bc_parser::prepare_if(cf_node
* c
) {
// Structure one conditional: c is a JUMP CF node whose addr points at the
// join (or at an ELSE whose addr then points at the join).
// NOTE(review): fragmentary — the dump guards around the cerr lines, the
// c_else assignment, error returns for the parent-mismatch checks, the
// dep2->move of the then-range and the final return are missing.
686 cf_node
*c_else
= NULL
, *end
= cf_map
[c
->bc
.addr
];
689 cerr
<< "parsing JUMP @" << c
->bc
.id
;
693 if (end
->bc
.op
== CF_OP_ELSE
) {
695 cerr
<< " found ELSE : ";
701 end
= cf_map
[c_else
->bc
.addr
];
704 cerr
<< " no else\n";
// Both branches must end within the same structured block as the JUMP.
710 if (c_else
->parent
!= c
->parent
)
713 if (end
->parent
!= c
->parent
)
// Build region { depart2 { if { then-range } } else-range } — the two
// departs model leaving the region from the then and else paths.
716 region_node
*reg
= sh
->create_region();
718 depart_node
*dep2
= sh
->create_depart(reg
);
719 depart_node
*dep
= sh
->create_depart(reg
);
720 if_node
*n_if
= sh
->create_if();
722 c
->insert_before(reg
);
725 dep
->move(c_else
, end
);
729 dep
->push_front(n_if
);
730 n_if
->push_back(dep2
);
// The branch condition is the current exec mask.
732 n_if
->cond
= sh
->get_special_value(SV_EXEC_MASK
);
737 int bc_parser::prepare_alu_clause(cf_node
* c
) {
// Cayman-only fixup (called from prepare_ir): pad 3-slot packed scalar
// instructions to 4 slots. NOTE(review): fragmentary — the code that fills in
// the created 4th-slot alu_node and the return are missing from this view.
739 // loop over alu groups
740 for (node_iterator I
= c
->begin(), E
= c
->end(); I
!= E
; ++I
) {
741 assert(I
->subtype
== NST_ALU_GROUP
);
743 alu_group_node
*g
= static_cast<alu_group_node
*>(*I
);
745 // loop over alu_group items
746 for (node_iterator I2
= g
->begin(), E2
= g
->end(); I2
!= E2
; ++I2
) {
// Only packed (multislot) instructions need the fixup.
747 if (I2
->subtype
!= NST_ALU_PACKED_INST
)
750 alu_packed_node
*p
= static_cast<alu_packed_node
*>(*I2
);
752 if (p
->count() == 3) {
753 // cayman's scalar instruction that takes 3 or 4 slots
755 // FIXME for simplicity we'll always add 4th slot,
756 // but probably we might want to always remove 4th slot and make
757 // sure that regalloc won't choose w component for dst
// Clone shape from the first slot into a new alu_node for the 4th slot.
759 alu_node
*f
= static_cast<alu_node
*>(p
->first
);
760 alu_node
*a
= sh
->create_alu();
762 a
->dst
.resize(f
->dst
.size());
773 } // namespace r600_sb