2 * Mesa 3-D graphics library
4 * Copyright (C) 2012-2013 LunarG, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
25 * Chia-I Wu <olv@lunarg.com>
28 #include "tgsi/tgsi_dump.h"
29 #include "toy_compiler.h"
31 #include "toy_legalize.h"
32 #include "toy_optimize.h"
33 #include "toy_helpers.h"
34 #include "ilo_shader.h"
36 /* XXX Below is proof-of-concept code. Skip this file! */
40 * - primitive id is in r0.1. FS receives PID as a flat attribute.
41 * - set VUE header m0.1 for layered rendering
/*
 * Per-compile state of the geometry shader compiler: the shader being
 * built, its variant and stream-output info, the embedded toy compiler, and
 * the registers/counters that the lowering code in this file reads and
 * writes.
 * NOTE(review): the rest of the file accesses these fields through
 * gcc->payload.*, gcc->vars.*, gcc->static_data.* and gcc->dynamic_data.*,
 * so they live in nested sub-structs whose opening lines are not visible in
 * this listing; several fields used elsewhere (tgsi, write_so, is_static,
 * first_free_grf, ...) are not visible either.
 */
43 struct gs_compile_context
{
44 struct ilo_shader
*shader
;
45 const struct ilo_shader_variant
*variant
;
46 const struct pipe_stream_output_info
*so_info
;
48 struct toy_compiler tc
;
/* VUE output slot -> TGSI output slot, or -1 (filled by gs_setup_shader_out) */
50 int output_map
[PIPE_MAX_SHADER_OUTPUTS
];
59 int out_vue_min_count
;
64 struct toy_src header
;
/* one entry per input vertex; gs_setup() uses at most 6 */
66 struct toy_src vues
[6];
70 struct toy_dst urb_write_header
;
77 /* buffered tgsi_outs */
78 struct toy_dst buffers
[3];
79 int buffer_needed
, buffer_cur
;
81 struct toy_dst so_written
;
82 struct toy_dst so_index
;
84 struct toy_src tgsi_outs
[PIPE_MAX_SHADER_OUTPUTS
];
/* the four fields below are registers, updated by emitted instructions */
88 struct toy_dst total_vertices
;
89 struct toy_dst total_prims
;
91 struct toy_dst num_vertices
;
92 struct toy_dst num_vertices_in_prim
;
/*
 * Bitmask with one bit per emitted vertex (8 * 32 bits); a bit is set when
 * that vertex is the last one before an ENDPRIM.
 */
98 /* this limits the max vertex count to 256 */
99 uint32_t last_vertex
[8];
/* compile-time counter, as opposed to the register of the same name above */
102 int num_vertices_in_prim
;
/*
 * Emit a MOV of src into dst with the execution size forced to
 * BRW_EXECUTE_8 and the mask control set to BRW_MASK_DISABLE.
 */
112 gs_COPY8(struct toy_compiler
*tc
, struct toy_dst dst
, struct toy_src src
)
114 struct toy_inst
*inst
;
116 inst
= tc_MOV(tc
, dst
, src
);
117 inst
->exec_size
= BRW_EXECUTE_8
;
118 inst
->mask_ctrl
= BRW_MASK_DISABLE
;
/*
 * Copy with execution size BRW_EXECUTE_4 and mask control disabled.  dst_ch
 * and src_ch are passed as the last argument of tdst_offset()/tsrc_offset()
 * to select where in the destination/source register the copy starts.
 * NOTE(review): the line that assigns inst is not visible in this listing;
 * presumably it is a tc_MOV() as in gs_COPY8 -- confirm.
 */
122 gs_COPY4(struct toy_compiler
*tc
,
123 struct toy_dst dst
, int dst_ch
,
124 struct toy_src src
, int src_ch
)
126 struct toy_inst
*inst
;
129 tdst_offset(dst
, 0, dst_ch
),
130 tsrc_offset(src
, 0, src_ch
));
131 inst
->exec_size
= BRW_EXECUTE_4
;
132 inst
->mask_ctrl
= BRW_MASK_DISABLE
;
/*
 * Copy with execution size BRW_EXECUTE_1 and mask control disabled.  The
 * source is offset by src_ch and given the TOY_RECT_010 region; the
 * destination is offset by dst_ch.
 * NOTE(review): the line that assigns inst is not visible in this listing;
 * presumably it is a tc_MOV() as in gs_COPY8 -- confirm.
 */
136 gs_COPY1(struct toy_compiler
*tc
,
137 struct toy_dst dst
, int dst_ch
,
138 struct toy_src src
, int src_ch
)
140 struct toy_inst
*inst
;
143 tdst_offset(dst
, 0, dst_ch
),
144 tsrc_rect(tsrc_offset(src
, 0, src_ch
), TOY_RECT_010
));
145 inst
->exec_size
= BRW_EXECUTE_1
;
146 inst
->mask_ctrl
= BRW_MASK_DISABLE
;
/*
 * Initialize the compile-time variables in gcc->vars and emit the
 * instructions that initialize their registers: the URB_WRITE header is
 * seeded from the payload header, prim_start/prim_end are reset, prim_type
 * is chosen from out_vue_min_count, and so_written is zeroed.
 * NOTE(review): the case labels of the switch and any guard around the
 * so_written MOV are not visible in this listing.
 */
150 gs_init_vars(struct gs_compile_context
*gcc
)
152 struct toy_compiler
*tc
= &gcc
->tc
;
155 /* init URB_WRITE header */
156 dst
= gcc
->vars
.urb_write_header
;
158 gs_COPY8(tc
, dst
, gcc
->payload
.header
);
/* the first emitted vertex starts a primitive */
160 gcc
->vars
.prim_start
= true;
161 gcc
->vars
.prim_end
= false;
162 switch (gcc
->out_vue_min_count
) {
164 gcc
->vars
.prim_type
= _3DPRIM_POINTLIST
;
167 gcc
->vars
.prim_type
= _3DPRIM_LINESTRIP
;
170 gcc
->vars
.prim_type
= _3DPRIM_TRISTRIP
;
175 tc_MOV(tc
, gcc
->vars
.so_written
, tsrc_imm_d(0));
/*
 * Emit MOVs that copy the first gcc->shader->out.count entries of outs into
 * the buffer selected by gcc->vars.buffer_cur, one register offset per
 * output, then advance buffer_cur modulo gcc->vars.buffer_needed.
 * NOTE(review): the modulo requires buffer_needed != 0.  gs_setup_vars()
 * sets it to out_vue_min_count - 1 only when write_so is set; the callers
 * visible here check out_vue_min_count > 1 -- confirm that they also check
 * write_so.
 */
179 gs_save_output(struct gs_compile_context
*gcc
, const struct toy_src
*outs
)
181 struct toy_compiler
*tc
= &gcc
->tc
;
182 const struct toy_dst buf
= gcc
->vars
.buffers
[gcc
->vars
.buffer_cur
];
185 for (i
= 0; i
< gcc
->shader
->out
.count
; i
++)
186 tc_MOV(tc
, tdst_offset(buf
, i
, 0), outs
[i
]);
188 /* advance the cursor */
189 gcc
->vars
.buffer_cur
++;
190 gcc
->vars
.buffer_cur
%= gcc
->vars
.buffer_needed
;
/*
 * Emit one streamed vertex buffer write through the data port.  The message
 * header is built in the MRF at gcc->first_free_mrf: m0.5 receives index and
 * m0.0-m0.3 receive out.  send_write_commit_message is forwarded to the
 * message descriptor and binding_table_index selects the surface.
 * NOTE(review): the body uses dst and desc, whose declarations (a parameter
 * and a local, presumably) are not visible in this listing.
 */
194 gs_write_so(struct gs_compile_context
*gcc
,
196 struct toy_src index
, struct toy_src out
,
197 bool send_write_commit_message
,
198 int binding_table_index
)
200 struct toy_compiler
*tc
= &gcc
->tc
;
201 struct toy_dst mrf_header
;
204 mrf_header
= tdst_d(tdst(TOY_FILE_MRF
, gcc
->first_free_mrf
, 0));
206 /* m0.5: destination index */
207 gs_COPY1(tc
, mrf_header
, 5, index
, 0);
209 /* m0.0 - m0.3: RGBA */
/* out is retyped to the header's type before the copy */
210 gs_COPY4(tc
, mrf_header
, 0, tsrc_type(out
, mrf_header
.type
), 0);
212 desc
= tsrc_imm_mdesc_data_port(tc
, false,
213 1, send_write_commit_message
,
214 true, send_write_commit_message
,
215 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE
, 0,
216 binding_table_index
);
218 tc_SEND(tc
, dst
, tsrc_from(mrf_header
), desc
,
219 GEN6_SFID_DATAPORT_RENDER_CACHE
);
/*
 * Emit the URB_WRITE message(s) that write num_outs attributes from outs.
 * msg_header is copied into the MRF at gcc->first_free_mrf; the attributes
 * are then packed two per MRF (sub-register offsets 0 and 4) into the
 * following MRFs.  When more attributes remain than MRFs are available
 * (last_free_mrf - first_free_mrf), the write is split into several
 * messages, with sent used as the offset argument of each descriptor.
 * NOTE(review): the trailing parameter(s) (eot is used below), the locals
 * sent/desc/complete, the else branches and any per-iteration update of mrf
 * are not visible in this listing, so the register actually targeted by
 * "mrf + i / 2" cannot be confirmed from here.
 */
223 gs_write_vue(struct gs_compile_context
*gcc
,
224 struct toy_dst dst
, struct toy_src msg_header
,
225 const struct toy_src
*outs
, int num_outs
,
228 struct toy_compiler
*tc
= &gcc
->tc
;
229 struct toy_dst mrf_header
;
233 mrf_header
= tdst_d(tdst(TOY_FILE_MRF
, gcc
->first_free_mrf
, 0));
234 gs_COPY8(tc
, mrf_header
, msg_header
);
236 while (sent
< num_outs
) {
/* the first MRF holds the header; data starts right after it */
237 int mrf
= gcc
->first_free_mrf
+ 1;
238 const int mrf_avail
= gcc
->last_free_mrf
- mrf
+ 1;
239 int msg_len
, num_entries
, i
;
/* two attributes per entry, rounded up */
242 num_entries
= (num_outs
- sent
+ 1) / 2;
244 if (num_entries
> mrf_avail
) {
245 num_entries
= mrf_avail
;
249 for (i
= 0; i
< num_entries
; i
++) {
250 gs_COPY4(tc
, tdst(TOY_FILE_MRF
, mrf
+ i
/ 2, 0), 0,
251 outs
[sent
+ 2 * i
], 0);
/* the second half is skipped when the attribute count is odd */
252 if (sent
+ i
* 2 + 1 < gcc
->shader
->out
.count
) {
253 gs_COPY4(tc
, tdst(TOY_FILE_MRF
, mrf
+ i
/ 2, 0), 4,
254 outs
[sent
+ 2 * i
+ 1], 0);
259 /* do not forget the header */
260 msg_len
= num_entries
+ 1;
263 desc
= tsrc_imm_mdesc_urb(tc
,
264 eot
, msg_len
, !eot
, true, true, !eot
,
265 BRW_URB_SWIZZLE_NONE
, sent
, 0);
268 desc
= tsrc_imm_mdesc_urb(tc
,
269 false, msg_len
, 0, false, true, false,
270 BRW_URB_SWIZZLE_NONE
, sent
, 0);
/* only the completing message writes back to dst */
273 tc_add2(tc
, TOY_OPCODE_URB_WRITE
,
274 (complete
) ? dst
: tdst_null(), tsrc_from(mrf_header
), desc
);
276 sent
+= num_entries
* 2;
/*
 * Emit a SEND to BRW_SFID_URB whose header is the payload header with m0.0
 * and m0.1 overwritten from num_prims; the response goes to dst.
 *
 * m0.0 packs (num_prims * in_vue_count) into the upper 16 bits and num_prims
 * into the lower bits.  When num_prims is an immediate the value is computed
 * at compile time; otherwise MUL and OR are emitted using gcc->vars.tmp.
 * NOTE(review): the guards around each step, the declarations of v, desc and
 * allocate, and the else keyword are not visible in this listing.
 */
281 gs_ff_sync(struct gs_compile_context
*gcc
, struct toy_dst dst
,
282 struct toy_src num_prims
)
284 struct toy_compiler
*tc
= &gcc
->tc
;
285 struct toy_dst mrf_header
=
286 tdst_d(tdst(TOY_FILE_MRF
, gcc
->first_free_mrf
, 0));
290 gs_COPY8(tc
, mrf_header
, gcc
->payload
.header
);
292 /* set NumSOVertsToWrite and NumSOPrimsNeeded */
294 if (num_prims
.file
== TOY_FILE_IMM
) {
296 (num_prims
.val32
* gcc
->in_vue_count
) << 16 | num_prims
.val32
;
298 gs_COPY1(tc
, mrf_header
, 0, tsrc_imm_d(v
), 0);
301 struct toy_dst m0_0
= tdst_d(gcc
->vars
.tmp
);
/* multiplying by (in_vue_count << 16) places the product in the upper bits */
303 tc_MUL(tc
, m0_0
, num_prims
, tsrc_imm_d(gcc
->in_vue_count
<< 16));
304 tc_OR(tc
, m0_0
, tsrc_from(m0_0
), num_prims
);
306 gs_COPY1(tc
, mrf_header
, 0, tsrc_from(m0_0
), 0);
310 /* set NumGSPrimsGenerated */
312 gs_COPY1(tc
, mrf_header
, 1, num_prims
, 0);
315 * From the Sandy Bridge PRM, volume 2 part 1, page 173:
317 * "Programming Note: If the GS stage is enabled, software must always
318 * allocate at least one GS URB Entry. This is true even if the GS
319 * thread never needs to output vertices to the pipeline, e.g., when
320 * only performing stream output. This is an artifact of the need to
321 * pass the GS thread an initial destination URB handle."
324 desc
= tsrc_imm_mdesc_urb(tc
, false, 1, 1,
325 false, false, allocate
,
326 BRW_URB_SWIZZLE_NONE
, 0, 1);
328 tc_SEND(tc
, dst
, tsrc_from(mrf_header
), desc
, BRW_SFID_URB
);
/*
 * Emit a header-only URB_WRITE (message length 1) with a null destination.
 * The header is copied from gcc->vars.urb_write_header.
 * NOTE(review): the first descriptor argument is true here where
 * gs_write_vue() passes eot, so this presumably terminates the thread --
 * confirm against tsrc_imm_mdesc_urb().
 */
332 gs_discard(struct gs_compile_context
*gcc
)
334 struct toy_compiler
*tc
= &gcc
->tc
;
335 struct toy_dst mrf_header
;
338 mrf_header
= tdst_d(tdst(TOY_FILE_MRF
, gcc
->first_free_mrf
, 0));
340 gs_COPY8(tc
, mrf_header
, tsrc_from(gcc
->vars
.urb_write_header
));
342 desc
= tsrc_imm_mdesc_urb(tc
,
343 true, 1, 0, true, false, false,
344 BRW_URB_SWIZZLE_NONE
, 0, 0);
346 tc_add2(tc
, TOY_OPCODE_URB_WRITE
,
347 tdst_null(), tsrc_from(mrf_header
), desc
);
/*
 * Lower TOY_OPCODE_ENDPRIM.  Only the comment below is visible in the body
 * in this listing; callers in this file pass either the ENDPRIM instruction
 * or NULL as inst.
 */
351 gs_lower_opcode_endprim(struct gs_compile_context
*gcc
, struct toy_inst
*inst
)
353 /* if has control flow, set PrimEnd on the last vertex and URB_WRITE */
/*
 * VUE write for an EMIT when the vertex count is not known at compile time.
 * Not implemented: only the TODO below is visible in the body.
 */
357 gs_lower_opcode_emit_vue_dynamic(struct gs_compile_context
*gcc
)
359 /* TODO similar to the static version */
362 * When SO is enabled and the inputs are lines or triangles, vertices are
363 * always buffered. we can defer the emission of the current vertex until
364 * the next EMIT or ENDPRIM. Or, we can emit two URB_WRITEs with the later
365 * patching the former.
/*
 * Stream-output part of an EMIT when the vertex count is not known at
 * compile time.  Emits an IF that compares the num_vertices_in_prim register
 * with out_vue_min_count, computes so_index + 0x03020100 into gcc->vars.tmp,
 * and advances so_index by out_vue_min_count.  The actual writes are still a
 * TODO.
 * NOTE(review): the IF condition code and the matching ENDIF are not visible
 * in this listing.  The static version loads 0x03020100 with tsrc_imm_v()
 * whereas this one uses tsrc_imm_d() -- confirm which is intended.
 */
370 gs_lower_opcode_emit_so_dynamic(struct gs_compile_context
*gcc
)
372 struct toy_compiler
*tc
= &gcc
->tc
;
374 tc_IF(tc
, tdst_null(),
375 tsrc_from(gcc
->dynamic_data
.num_vertices_in_prim
),
376 tsrc_imm_d(gcc
->out_vue_min_count
),
380 tc_ADD(tc
, gcc
->vars
.tmp
, tsrc_from(gcc
->vars
.so_index
), tsrc_imm_d(0x03020100));
382 /* TODO same as static version */
387 tc_ADD(tc
, gcc
->vars
.so_index
,
388 tsrc_from(gcc
->vars
.so_index
), tsrc_imm_d(gcc
->out_vue_min_count
));
/*
 * VUE write for an EMIT when all vertex counts are known at compile time.
 *
 * eot is set when this is the final vertex (num_vertices == total_vertices).
 * The last_vertex bitmask is tested for the current vertex; the result is
 * presumably stored in gcc->vars.prim_end (the assignment target is not
 * visible in this listing).  Dword 2 of the URB_WRITE header is then set to
 * prim_type << 2 | prim_start << 1 | prim_end, ORed with the so_written
 * register when eot and write_so are both set.  After gs_write_vue(), the
 * value returned in gcc->vars.tmp is copied to header dword 0, and
 * prim_start/prim_end are updated for the next vertex.
 * NOTE(review): the else keyword, the outs argument of gs_write_vue() and
 * any guard around the final COPY1 are not visible in this listing.
 */
392 gs_lower_opcode_emit_vue_static(struct gs_compile_context
*gcc
)
394 struct toy_compiler
*tc
= &gcc
->tc
;
395 struct toy_inst
*inst2
;
398 eot
= (gcc
->static_data
.num_vertices
== gcc
->static_data
.total_vertices
);
/* num_vertices was already incremented by the caller, hence the -1 */
401 ((gcc
->static_data
.last_vertex
[(gcc
->static_data
.num_vertices
- 1) / 32] &
402 1 << ((gcc
->static_data
.num_vertices
- 1) % 32)) != 0);
404 if (eot
&& gcc
->write_so
) {
405 inst2
= tc_OR(tc
, tdst_offset(gcc
->vars
.urb_write_header
, 0, 2),
406 tsrc_from(gcc
->vars
.so_written
),
407 tsrc_imm_d(gcc
->vars
.prim_type
<< 2 |
408 gcc
->vars
.prim_start
<< 1 |
409 gcc
->vars
.prim_end
));
/* make the OR a single-channel operation on one dword */
410 inst2
->exec_size
= BRW_EXECUTE_1
;
411 inst2
->src
[0] = tsrc_rect(inst2
->src
[0], TOY_RECT_010
);
412 inst2
->src
[1] = tsrc_rect(inst2
->src
[1], TOY_RECT_010
);
415 gs_COPY1(tc
, gcc
->vars
.urb_write_header
, 2,
416 tsrc_imm_d(gcc
->vars
.prim_type
<< 2 |
417 gcc
->vars
.prim_start
<< 1 |
418 gcc
->vars
.prim_end
), 0);
421 gs_write_vue(gcc
, tdst_d(gcc
->vars
.tmp
),
422 tsrc_from(gcc
->vars
.urb_write_header
),
424 gcc
->shader
->out
.count
, eot
);
427 gs_COPY1(tc
, gcc
->vars
.urb_write_header
, 0,
428 tsrc_from(tdst_d(gcc
->vars
.tmp
)), 0);
/* a vertex that ended a primitive makes the next one a primitive start */
431 gcc
->vars
.prim_start
= gcc
->vars
.prim_end
;
432 gcc
->vars
.prim_end
= false;
/*
 * Stream-output part of an EMIT when all vertex counts are known at compile
 * time.
 *
 * Nothing is written until the current primitive has at least
 * out_vue_min_count vertices.  gcc->vars.tmp is loaded with the packed
 * immediate 0x03020100 and so_index is added to it; an IF then compares the
 * element at offset out_vue_min_count - 1 with dword 4 of the SVBI payload.
 * For every vertex i of the primitive and every stream output j, one
 * gs_write_so() is issued: the newest vertex is read from
 * gcc->vars.tgsi_outs, older ones from the buffers written by
 * gs_save_output().  Finally so_written and so_index are advanced.
 * NOTE(review): the statement under the first if (presumably a return), the
 * IF condition code, the else keyword, the ENDIF and the declarations of
 * i/j/buf/write_commit are not visible in this listing.
 */
436 gs_lower_opcode_emit_so_static(struct gs_compile_context
*gcc
)
438 struct toy_compiler
*tc
= &gcc
->tc
;
439 struct toy_inst
*inst
;
442 if (gcc
->static_data
.num_vertices_in_prim
< gcc
->out_vue_min_count
)
445 inst
= tc_MOV(tc
, tdst_w(gcc
->vars
.tmp
), tsrc_imm_v(0x03020100));
446 inst
->exec_size
= BRW_EXECUTE_8
;
447 inst
->mask_ctrl
= BRW_MASK_DISABLE
;
449 tc_ADD(tc
, tdst_d(gcc
->vars
.tmp
), tsrc_from(tdst_d(gcc
->vars
.tmp
)),
450 tsrc_rect(tsrc_from(gcc
->vars
.so_index
), TOY_RECT_010
));
452 tc_IF(tc
, tdst_null(),
453 tsrc_rect(tsrc_offset(tsrc_from(tdst_d(gcc
->vars
.tmp
)), 0, gcc
->out_vue_min_count
- 1), TOY_RECT_010
),
454 tsrc_rect(tsrc_offset(gcc
->payload
.svbi
, 0, 4), TOY_RECT_010
),
457 for (i
= 0; i
< gcc
->out_vue_min_count
; i
++) {
458 for (j
= 0; j
< gcc
->so_info
->num_outputs
; j
++) {
459 const int idx
= gcc
->so_info
->output
[j
].register_index
;
460 struct toy_src index
, out
;
461 int binding_table_index
;
/* destination index of vertex i, taken from element i of tmp */
464 index
= tsrc_d(tsrc_offset(tsrc_from(gcc
->vars
.tmp
), 0, i
));
/* the newest vertex has not been buffered; read it directly */
466 if (i
== gcc
->out_vue_min_count
- 1) {
467 out
= gcc
->vars
.tgsi_outs
[idx
];
470 /* gcc->vars.buffer_cur also points to the first vertex */
472 (gcc
->vars
.buffer_cur
+ i
) % gcc
->vars
.buffer_needed
;
474 out
= tsrc_offset(tsrc_from(gcc
->vars
.buffers
[buf
]), idx
, 0);
477 out
= tsrc_offset(out
, 0, gcc
->so_info
->output
[j
].start_component
);
480 * From the Sandy Bridge PRM, volume 4 part 2, page 19:
482 * "The Kernel must do a write commit on the last write to DAP
483 * prior to a URB_WRITE with End of Thread."
/* true only for the very last write of the very last vertex */
486 (gcc
->static_data
.num_vertices
== gcc
->static_data
.total_vertices
&&
487 i
== gcc
->out_vue_min_count
- 1 &&
488 j
== gcc
->so_info
->num_outputs
- 1);
491 binding_table_index
= ILO_GS_SO_SURFACE(j
);
493 gs_write_so(gcc
, gcc
->vars
.tmp
, index
,
494 out
, write_commit
, binding_table_index
);
497 * From the Sandy Bridge PRM, volume 4 part 1, page 168:
499 * "The write commit does not modify the destination register, but
500 * merely clears the dependency associated with the destination
501 * register. Thus, a simple "mov" instruction using the register as a
502 * source is sufficient to wait for the write commit to occur."
505 tc_MOV(tc
, gcc
->vars
.tmp
, tsrc_from(gcc
->vars
.tmp
));
509 /* SONumPrimsWritten occupies the higher word of m0.2 of URB_WRITE */
510 tc_ADD(tc
, gcc
->vars
.so_written
,
511 tsrc_from(gcc
->vars
.so_written
), tsrc_imm_d(1 << 16));
512 tc_ADD(tc
, gcc
->vars
.so_index
,
513 tsrc_from(gcc
->vars
.so_index
), tsrc_imm_d(gcc
->out_vue_min_count
));
/*
 * Lower one EMIT when vertex counts are known at compile time: bump the
 * compile-time vertex counters, do the stream-output writes, buffer the
 * current outputs when the primitive needs more than one vertex and more
 * vertices follow, then write the VUE.
 * NOTE(review): the guards around the SO and VUE calls (presumably write_so
 * and write_vue) are not visible in this listing.  inst is not used in the
 * visible body.
 */
519 gs_lower_opcode_emit_static(struct gs_compile_context
*gcc
,
520 struct toy_inst
*inst
)
522 gcc
->static_data
.num_vertices
++;
523 gcc
->static_data
.num_vertices_in_prim
++;
526 gs_lower_opcode_emit_so_static(gcc
);
/* the last vertex never needs to be kept for a later primitive */
528 if (gcc
->out_vue_min_count
> 1 &&
529 gcc
->static_data
.num_vertices
!= gcc
->static_data
.total_vertices
)
530 gs_save_output(gcc
, gcc
->vars
.tgsi_outs
);
534 gs_lower_opcode_emit_vue_static(gcc
);
/*
 * Lower one EMIT when vertex counts are only known at run time: emit ADDs
 * that increment the num_vertices and num_vertices_in_prim registers, then
 * run the dynamic SO path, buffer the outputs when a primitive needs more
 * than one vertex, and run the dynamic VUE path.
 * NOTE(review): the guards around the SO and VUE calls are not visible in
 * this listing.  inst is not used in the visible body.
 */
538 gs_lower_opcode_emit_dynamic(struct gs_compile_context
*gcc
,
539 struct toy_inst
*inst
)
541 struct toy_compiler
*tc
= &gcc
->tc
;
543 tc_ADD(tc
, gcc
->dynamic_data
.num_vertices
,
544 tsrc_from(gcc
->dynamic_data
.num_vertices
), tsrc_imm_d(1));
545 tc_ADD(tc
, gcc
->dynamic_data
.num_vertices_in_prim
,
546 tsrc_from(gcc
->dynamic_data
.num_vertices_in_prim
), tsrc_imm_d(1));
549 gs_lower_opcode_emit_so_dynamic(gcc
);
551 if (gcc
->out_vue_min_count
> 1)
552 gs_save_output(gcc
, gcc
->vars
.tgsi_outs
);
556 gs_lower_opcode_emit_vue_dynamic(gcc
);
/*
 * Lower TOY_OPCODE_EMIT by dispatching to the static or the dynamic
 * implementation.
 * NOTE(review): the selecting condition is not visible in this listing;
 * presumably it is gcc->is_static -- confirm.
 */
560 gs_lower_opcode_emit(struct gs_compile_context
*gcc
, struct toy_inst
*inst
)
563 gs_lower_opcode_emit_static(gcc
, inst
);
565 gs_lower_opcode_emit_dynamic(gcc
, inst
);
/*
 * Emit the MOV(s) that load TGSI input idx of vertex dim into dst.
 *
 * The TGSI input slot is matched by semantic name and index against the
 * inputs listed in the GS variant to find where the attribute sits in the
 * vertex payload: two attributes share a register, so subreg is 0 or 4.
 * When the input has three vertices and dim is 0 or 1, the primitive type is
 * read from the low 5 bits of payload header dword 2 and compared with
 * _3DPRIM_TRISTRIP_REVERSE; two oppositely predicated MOVs then pick the
 * attribute so that IN[0] and IN[1] are swapped for reversed strips.
 * Otherwise a single unpredicated MOV is emitted.
 * NOTE(review): the assignment of reg, the break, the slot < 0 / reg < 0
 * handling that leads to the 0.0f MOV, the update of dim between the two
 * predicated MOVs and the else keyword are not visible in this listing.
 */
569 gs_lower_opcode_tgsi_in(struct gs_compile_context
*gcc
,
570 struct toy_dst dst
, int dim
, int idx
)
572 struct toy_compiler
*tc
= &gcc
->tc
;
/* reg stays -1 until a matching variant input is found */
574 int slot
, reg
= -1, subreg
;
576 slot
= toy_tgsi_find_input(&gcc
->tgsi
, idx
);
580 for (i
= 0; i
< gcc
->variant
->u
.gs
.num_inputs
; i
++) {
581 if (gcc
->variant
->u
.gs
.semantic_names
[i
] ==
582 gcc
->tgsi
.inputs
[slot
].semantic_name
&&
583 gcc
->variant
->u
.gs
.semantic_indices
[i
] ==
584 gcc
->tgsi
.inputs
[slot
].semantic_index
) {
586 subreg
= (i
% 2) * 4;
593 tc_MOV(tc
, dst
, tsrc_imm_f(0.0f
));
597 /* fix vertex ordering for _3DPRIM_TRISTRIP_REVERSE */
598 if (gcc
->in_vue_count
== 3 && dim
< 2) {
599 struct toy_inst
*inst
;
/* extract the primitive type from the payload header */
602 inst
= tc_AND(tc
, tdst_d(gcc
->vars
.tmp
),
603 tsrc_offset(gcc
->payload
.header
, 0, 2), tsrc_imm_d(0x1f));
604 inst
->exec_size
= BRW_EXECUTE_1
;
605 inst
->src
[0] = tsrc_rect(inst
->src
[0], TOY_RECT_010
);
606 inst
->src
[1] = tsrc_rect(inst
->src
[1], TOY_RECT_010
);
/* the CMP sets the predicate used by the two MOVs below */
608 inst
= tc_CMP(tc
, tdst_null(), tsrc_from(tdst_d(gcc
->vars
.tmp
)),
609 tsrc_imm_d(_3DPRIM_TRISTRIP_REVERSE
), BRW_CONDITIONAL_NEQ
);
610 inst
->src
[0] = tsrc_rect(inst
->src
[0], TOY_RECT_010
);
612 attr
= tsrc_offset(gcc
->payload
.vues
[dim
], reg
, subreg
);
613 inst
= tc_MOV(tc
, dst
, attr
);
614 inst
->pred_ctrl
= BRW_PREDICATE_NORMAL
;
616 /* swap IN[0] and IN[1] for _3DPRIM_TRISTRIP_REVERSE */
619 attr
= tsrc_offset(gcc
->payload
.vues
[dim
], reg
, subreg
);
620 inst
= tc_MOV(tc
, dst
, attr
);
621 inst
->pred_ctrl
= BRW_PREDICATE_NORMAL
;
622 inst
->pred_inv
= true;
625 attr
= tsrc_offset(gcc
->payload
.vues
[dim
], reg
, subreg
);
626 tc_MOV(tc
, dst
, attr
);
/*
 * Load TGSI immediate idx into dst.  The four 32-bit components returned by
 * toy_tgsi_get_imm() are written one channel at a time: each MOV uses an
 * unsigned-dword view of dst with a single-channel writemask and is forced
 * to BRW_ALIGN_16 access mode.
 */
633 gs_lower_opcode_tgsi_imm(struct gs_compile_context
*gcc
,
634 struct toy_dst dst
, int idx
)
639 imm
= toy_tgsi_get_imm(&gcc
->tgsi
, idx
, NULL
);
641 for (ch
= 0; ch
< 4; ch
++) {
642 struct toy_inst
*inst
;
645 inst
= tc_MOV(&gcc
->tc
,
646 tdst_writemask(tdst_ud(dst
), 1 << ch
),
647 tsrc_imm_ud(imm
[ch
]));
648 inst
->access_mode
= BRW_ALIGN_16
;
/*
 * Lower a direct TGSI fetch (IN, IMM, CONST or SV).  Both sources of inst
 * must be immediates: src[0] is the dimension (vertex) and src[1] the
 * register index.  IN fetches the requested vertex and then the same input
 * for vertices 1 .. in_vue_count - 1 into their own VRFs; IMM loads the
 * immediate; CONST and SV are reported through tc_fail().  The virtual
 * instruction is discarded afterwards.
 * NOTE(review): the break statements, the guard around the "fetch all
 * dimensions" loop, and the checks between toy_tgsi_get_vrf() and its use
 * are not visible in this listing.
 */
653 gs_lower_opcode_tgsi_direct(struct gs_compile_context
*gcc
,
654 struct toy_inst
*inst
)
656 struct toy_compiler
*tc
= &gcc
->tc
;
659 assert(inst
->src
[0].file
== TOY_FILE_IMM
);
660 dim
= inst
->src
[0].val32
;
662 assert(inst
->src
[1].file
== TOY_FILE_IMM
);
663 idx
= inst
->src
[1].val32
;
665 switch (inst
->opcode
) {
666 case TOY_OPCODE_TGSI_IN
:
667 gs_lower_opcode_tgsi_in(gcc
, inst
->dst
, dim
, idx
);
668 /* fetch all dimensions */
672 for (i
= 1; i
< gcc
->in_vue_count
; i
++) {
673 const int vrf
= toy_tgsi_get_vrf(&gcc
->tgsi
, TGSI_FILE_INPUT
, i
, idx
);
679 dst
= tdst(TOY_FILE_VRF
, vrf
, 0);
680 gs_lower_opcode_tgsi_in(gcc
, dst
, i
, idx
);
684 case TOY_OPCODE_TGSI_IMM
:
686 gs_lower_opcode_tgsi_imm(gcc
, inst
->dst
, idx
);
688 case TOY_OPCODE_TGSI_CONST
:
689 case TOY_OPCODE_TGSI_SV
:
691 tc_fail(tc
, "unhandled TGSI fetch");
695 tc_discard_inst(tc
, inst
);
/*
 * Replace the virtual opcodes in the instruction list, in two passes over
 * tc_next().
 *
 * Pass 1 handles the TGSI-level opcodes: direct fetches are lowered,
 * indirect access and all sampling opcodes fail the compile and are
 * discarded, and EMIT/ENDPRIM are lowered and discarded.
 * Pass 2 handles what pass 1 produced: math opcodes go through
 * toy_compiler_lower_math(), URB_WRITE becomes a SEND to BRW_SFID_URB, and
 * any remaining opcode above 127 fails the compile.
 * NOTE(review): the break/default lines, the rewind between the passes and
 * some case labels of the math group are not visible in this listing.
 */
699 gs_lower_virtual_opcodes(struct gs_compile_context
*gcc
)
701 struct toy_compiler
*tc
= &gcc
->tc
;
702 struct toy_inst
*inst
;
705 while ((inst
= tc_next(tc
)) != NULL
) {
706 switch (inst
->opcode
) {
707 case TOY_OPCODE_TGSI_IN
:
708 case TOY_OPCODE_TGSI_CONST
:
709 case TOY_OPCODE_TGSI_SV
:
710 case TOY_OPCODE_TGSI_IMM
:
711 gs_lower_opcode_tgsi_direct(gcc
, inst
);
713 case TOY_OPCODE_TGSI_INDIRECT_FETCH
:
714 case TOY_OPCODE_TGSI_INDIRECT_STORE
:
715 /* TODO similar to VS */
716 tc_fail(tc
, "no indirection support");
717 tc_discard_inst(tc
, inst
);
719 case TOY_OPCODE_TGSI_TEX
:
720 case TOY_OPCODE_TGSI_TXB
:
721 case TOY_OPCODE_TGSI_TXD
:
722 case TOY_OPCODE_TGSI_TXL
:
723 case TOY_OPCODE_TGSI_TXP
:
724 case TOY_OPCODE_TGSI_TXF
:
725 case TOY_OPCODE_TGSI_TXQ
:
726 case TOY_OPCODE_TGSI_TXQ_LZ
:
727 case TOY_OPCODE_TGSI_TEX2
:
728 case TOY_OPCODE_TGSI_TXB2
:
729 case TOY_OPCODE_TGSI_TXL2
:
730 case TOY_OPCODE_TGSI_SAMPLE
:
731 case TOY_OPCODE_TGSI_SAMPLE_I
:
732 case TOY_OPCODE_TGSI_SAMPLE_I_MS
:
733 case TOY_OPCODE_TGSI_SAMPLE_B
:
734 case TOY_OPCODE_TGSI_SAMPLE_C
:
735 case TOY_OPCODE_TGSI_SAMPLE_C_LZ
:
736 case TOY_OPCODE_TGSI_SAMPLE_D
:
737 case TOY_OPCODE_TGSI_SAMPLE_L
:
738 case TOY_OPCODE_TGSI_GATHER4
:
739 case TOY_OPCODE_TGSI_SVIEWINFO
:
740 case TOY_OPCODE_TGSI_SAMPLE_POS
:
741 case TOY_OPCODE_TGSI_SAMPLE_INFO
:
742 /* TODO similar to VS */
743 tc_fail(tc
, "no sampling support");
744 tc_discard_inst(tc
, inst
);
746 case TOY_OPCODE_EMIT
:
747 gs_lower_opcode_emit(gcc
, inst
);
748 tc_discard_inst(tc
, inst
);
750 case TOY_OPCODE_ENDPRIM
:
751 gs_lower_opcode_endprim(gcc
, inst
);
752 tc_discard_inst(tc
, inst
);
/* second pass: opcodes introduced or left over by the first pass */
760 while ((inst
= tc_next(tc
)) != NULL
) {
761 switch (inst
->opcode
) {
765 case TOY_OPCODE_SQRT
:
769 case TOY_OPCODE_FDIV
:
771 case TOY_OPCODE_INT_DIV_QUOTIENT
:
772 case TOY_OPCODE_INT_DIV_REMAINDER
:
773 toy_compiler_lower_math(tc
, inst
);
775 case TOY_OPCODE_URB_WRITE
:
776 toy_compiler_lower_to_send(tc
, inst
, false, BRW_SFID_URB
);
779 if (inst
->opcode
> 127)
780 tc_fail(tc
, "unhandled virtual opcode");
787 * Get the number of (tessellated) primitives generated by this shader.
788 * Return false if that is unknown until runtime.
/*
 * Walk every instruction (tc_next_no_skip) and count, at compile time, the
 * vertices and primitives the shader emits.  Each EMIT outside control flow
 * increments static_data.total_vertices and, once the running primitive has
 * at least out_vue_min_count vertices, static_data.total_prims.  Each
 * ENDPRIM outside control flow sets the last_vertex bit of the most recently
 * counted vertex and restarts the per-primitive count.  The verdict is
 * stored in gcc->is_static.
 * NOTE(review): the IF/DO case labels, the depth updates and the statements
 * under "if (if_depth || do_depth)" (presumably is_static = false) are not
 * visible in this listing.
 * NOTE(review): no range check on idx is visible before last_vertex[idx] is
 * written (the array has 8 words, i.e. 256 vertices), and an ENDPRIM seen
 * before any EMIT gives vertidx == -1, hence a shift by -1 -- confirm
 * against the full source.
 */
791 get_num_prims_static(struct gs_compile_context
*gcc
)
793 struct toy_compiler
*tc
= &gcc
->tc
;
794 const struct toy_inst
*inst
;
795 int num_vertices_in_prim
= 0, if_depth
= 0, do_depth
= 0;
796 bool is_static
= true;
799 while ((inst
= tc_next_no_skip(tc
)) != NULL
) {
800 switch (inst
->opcode
) {
804 case BRW_OPCODE_ENDIF
:
810 case BRW_OPCODE_WHILE
:
813 case TOY_OPCODE_EMIT
:
814 if (if_depth
|| do_depth
) {
818 gcc
->static_data
.total_vertices
++;
820 num_vertices_in_prim
++;
/* in a strip, every vertex past the minimum completes one more primitive */
821 if (num_vertices_in_prim
>= gcc
->out_vue_min_count
)
822 gcc
->static_data
.total_prims
++;
825 case TOY_OPCODE_ENDPRIM
:
826 if (if_depth
|| do_depth
) {
/* index of the vertex emitted just before this ENDPRIM */
830 const int vertidx
= gcc
->static_data
.total_vertices
- 1;
831 const int idx
= vertidx
/ 32;
832 const int subidx
= vertidx
% 32;
834 gcc
->static_data
.last_vertex
[idx
] |= 1 << subidx
;
835 num_vertices_in_prim
= 0;
846 gcc
->is_static
= is_static
;
850 * Compile the shader.
/*
 * Compile a geometry shader that was translated from TGSI tokens.
 *
 * Steps visible here: count primitives statically; when the counts are
 * static, emit FF_SYNC with the total primitive count and copy parts of its
 * response from gcc->vars.tmp into urb_write_header and so_index; otherwise
 * fail with "no control flow support".  Then lower virtual opcodes, legalize,
 * optimize, allocate registers, legalize for assembly, assemble into
 * sh->kernel / sh->kernel_size, and dump or disassemble when ILO_DEBUG_GS
 * is set.
 * NOTE(review): the return statements, the tc->fail checks that guard the
 * ilo_err() calls, the register allocator arguments and the guards around
 * the COPY calls are not visible in this listing.
 */
853 gs_compile(struct gs_compile_context
*gcc
)
855 struct toy_compiler
*tc
= &gcc
->tc
;
856 struct ilo_shader
*sh
= gcc
->shader
;
858 get_num_prims_static(gcc
);
860 if (gcc
->is_static
) {
864 gs_ff_sync(gcc
, tdst_d(gcc
->vars
.tmp
), tsrc_imm_d(gcc
->static_data
.total_prims
));
/* dword 0 of the FF_SYNC response goes into the URB_WRITE header */
865 gs_COPY1(tc
, gcc
->vars
.urb_write_header
, 0, tsrc_from(tdst_d(gcc
->vars
.tmp
)), 0);
/* four dwords starting at dword 1 of the response seed so_index */
867 gs_COPY4(tc
, gcc
->vars
.so_index
, 0, tsrc_from(tdst_d(gcc
->vars
.tmp
)), 1);
872 tc_fail(tc
, "no control flow support");
879 gs_lower_virtual_opcodes(gcc
);
880 toy_compiler_legalize_for_ra(tc
);
881 toy_compiler_optimize(tc
);
882 toy_compiler_allocate_registers(tc
,
886 toy_compiler_legalize_for_asm(tc
);
889 ilo_err("failed to legalize GS instructions: %s\n", tc
->reason
);
893 if (ilo_debug
& ILO_DEBUG_GS
) {
894 ilo_printf("legalized instructions:\n");
895 toy_compiler_dump(tc
);
899 sh
->kernel
= toy_compiler_assemble(tc
, &sh
->kernel_size
);
903 if (ilo_debug
& ILO_DEBUG_GS
) {
904 ilo_printf("disassembly:\n");
905 toy_compiler_disassemble(tc
, sh
->kernel
, sh
->kernel_size
);
/*
 * Compile a pass-through geometry shader (no TGSI tokens): it re-emits the
 * in_vue_count input vertices as a single primitive.
 *
 * The static counters are filled in by hand: one primitive, in_vue_count
 * vertices, and the last_vertex bit set for the final vertex.  After
 * FF_SYNC, every attribute of each vertex is copied from the payload into
 * gcc->vars.tgsi_outs and an EMIT is lowered directly, followed by one
 * ENDPRIM.  The remaining steps (lowering, legalization, optimization,
 * register allocation, assembly, debug output) mirror gs_compile().
 * NOTE(review): the gs_init_vars() call, return statements, tc->fail checks
 * and guards are not visible in this listing.
 * NOTE(review): the message "failed to translate GS TGSI tokens" is printed
 * after the legalize step, where gs_compile() prints "failed to legalize GS
 * instructions" -- looks like a copy/paste leftover; confirm.
 */
913 gs_compile_passthrough(struct gs_compile_context
*gcc
)
915 struct toy_compiler
*tc
= &gcc
->tc
;
916 struct ilo_shader
*sh
= gcc
->shader
;
918 gcc
->is_static
= true;
919 gcc
->static_data
.total_vertices
= gcc
->in_vue_count
;
920 gcc
->static_data
.total_prims
= 1;
/* only the final vertex ends the primitive */
921 gcc
->static_data
.last_vertex
[0] = 1 << (gcc
->in_vue_count
- 1);
924 gs_ff_sync(gcc
, tdst_d(gcc
->vars
.tmp
), tsrc_imm_d(gcc
->static_data
.total_prims
));
925 gs_COPY1(tc
, gcc
->vars
.urb_write_header
, 0, tsrc_from(tdst_d(gcc
->vars
.tmp
)), 0);
927 gs_COPY4(tc
, gcc
->vars
.so_index
, 0, tsrc_from(tdst_d(gcc
->vars
.tmp
)), 1);
932 for (vert
= 0; vert
< gcc
->out_vue_min_count
; vert
++) {
933 for (attr
= 0; attr
< gcc
->shader
->out
.count
; attr
++) {
/* two attributes per payload register: register attr / 2, offset 0 or 4 */
934 tc_MOV(tc
, tdst_from(gcc
->vars
.tgsi_outs
[attr
]),
935 tsrc_offset(gcc
->payload
.vues
[vert
], attr
/ 2, (attr
% 2) * 4));
938 gs_lower_opcode_emit(gcc
, NULL
);
941 gs_lower_opcode_endprim(gcc
, NULL
);
947 gs_lower_virtual_opcodes(gcc
);
949 toy_compiler_legalize_for_ra(tc
);
950 toy_compiler_optimize(tc
);
951 toy_compiler_allocate_registers(tc
,
956 toy_compiler_legalize_for_asm(tc
);
959 ilo_err("failed to translate GS TGSI tokens: %s\n", tc
->reason
);
963 if (ilo_debug
& ILO_DEBUG_GS
) {
966 ilo_printf("VUE count %d, VUE size %d\n",
967 gcc
->in_vue_count
, gcc
->in_vue_size
);
968 ilo_printf("%srasterizer discard\n",
969 (gcc
->variant
->u
.gs
.rasterizer_discard
) ? "" : "no ");
971 for (i
= 0; i
< gcc
->so_info
->num_outputs
; i
++) {
972 ilo_printf("SO[%d] = OUT[%d]\n", i
,
973 gcc
->so_info
->output
[i
].register_index
);
976 ilo_printf("legalized instructions:\n");
977 toy_compiler_dump(tc
);
981 sh
->kernel
= toy_compiler_assemble(tc
, &sh
->kernel_size
);
983 ilo_err("failed to compile GS: %s\n", tc
->reason
);
987 if (ilo_debug
& ILO_DEBUG_GS
) {
988 ilo_printf("disassembly:\n");
989 toy_compiler_disassemble(tc
, sh
->kernel
, sh
->kernel_size
);
997 * Translate the TGSI tokens.
/*
 * Translate the TGSI tokens into toy-compiler instructions, filling *tgsi.
 * With ILO_DEBUG_GS set, the tokens are dumped before translation and the
 * translator state and resulting instructions are dumped afterwards.
 * NOTE(review): the return statements and the failure check after the
 * translation are not visible in this listing; gs_setup() treats the return
 * value as a success flag.
 */
1000 gs_setup_tgsi(struct toy_compiler
*tc
, const struct tgsi_token
*tokens
,
1001 struct toy_tgsi
*tgsi
)
1003 if (ilo_debug
& ILO_DEBUG_GS
) {
1004 ilo_printf("dumping geometry shader\n");
1007 tgsi_dump(tokens
, 0);
1011 toy_compiler_translate_tgsi(tc
, tokens
, true, tgsi
);
1015 if (ilo_debug
& ILO_DEBUG_GS
) {
1016 ilo_printf("TGSI translator:\n");
1017 toy_tgsi_dump(tgsi
);
1019 toy_compiler_dump(tc
);
1027 * Set up shader inputs for fixed-function units.
/*
 * Fill sh->in from the GS variant: semantic names and indices are copied
 * for each of variant->u.gs.num_inputs inputs, every input is marked
 * TGSI_INTERPOLATE_CONSTANT and non-centroid, and the position /
 * interpolation flags are cleared.
 */
1030 gs_setup_shader_in(struct ilo_shader
*sh
,
1031 const struct ilo_shader_variant
*variant
)
1035 for (i
= 0; i
< variant
->u
.gs
.num_inputs
; i
++) {
1036 sh
->in
.semantic_names
[i
] = variant
->u
.gs
.semantic_names
[i
];
1037 sh
->in
.semantic_indices
[i
] = variant
->u
.gs
.semantic_indices
[i
];
1038 sh
->in
.interp
[i
] = TGSI_INTERPOLATE_CONSTANT
;
1039 sh
->in
.centroid
[i
] = false;
1042 sh
->in
.count
= variant
->u
.gs
.num_inputs
;
1044 sh
->in
.has_pos
= false;
1045 sh
->in
.has_linear_interp
= false;
1046 sh
->in
.barycentric_interpolation_mode
= 0;
1050 * Set up shader outputs for fixed-function units.
1052 * XXX share the code with VS
/*
 * Decide the order of the shader outputs in the VUE, recording it in
 * sh->out (semantic names/indices, count, has_pos) and in output_map, which
 * maps each VUE slot to the TGSI output slot that feeds it (-1 when the
 * shader does not write that output).
 *
 * Order: PSIZE, POSITION, the two CLIPDIST slots when output_clipdist is
 * set, then the COLOR/BCOLOR slots found, then every remaining output in
 * TGSI order.
 * NOTE(review): the assignments to psize_slot, pos_slot and color_slot, the
 * initial value of num_outs, the skip of unused color slots and the
 * break/default lines are not visible in this listing.
 */
1055 gs_setup_shader_out(struct ilo_shader
*sh
, const struct toy_tgsi
*tgsi
,
1056 bool output_clipdist
, int *output_map
)
/* -1 means "not written by the shader" */
1058 int psize_slot
= -1, pos_slot
= -1;
1059 int clipdist_slot
[2] = { -1, -1 };
1060 int color_slot
[4] = { -1, -1, -1, -1 };
1063 /* find out the slots of outputs that need special care */
1064 for (i
= 0; i
< tgsi
->num_outputs
; i
++) {
1065 switch (tgsi
->outputs
[i
].semantic_name
) {
1066 case TGSI_SEMANTIC_PSIZE
:
1069 case TGSI_SEMANTIC_POSITION
:
1072 case TGSI_SEMANTIC_CLIPDIST
:
1073 if (tgsi
->outputs
[i
].semantic_index
)
1074 clipdist_slot
[1] = i
;
1076 clipdist_slot
[0] = i
;
1078 case TGSI_SEMANTIC_COLOR
:
1079 if (tgsi
->outputs
[i
].semantic_index
)
1084 case TGSI_SEMANTIC_BCOLOR
:
1085 if (tgsi
->outputs
[i
].semantic_index
)
1095 /* the first two VUEs are always PSIZE and POSITION */
1097 sh
->out
.semantic_names
[0] = TGSI_SEMANTIC_PSIZE
;
1098 sh
->out
.semantic_indices
[0] = 0;
1099 sh
->out
.semantic_names
[1] = TGSI_SEMANTIC_POSITION
;
1100 sh
->out
.semantic_indices
[1] = 0;
1102 sh
->out
.has_pos
= true;
1103 output_map
[0] = psize_slot
;
1104 output_map
[1] = pos_slot
;
1106 /* followed by optional clip distances */
1107 if (output_clipdist
) {
1108 sh
->out
.semantic_names
[num_outs
] = TGSI_SEMANTIC_CLIPDIST
;
1109 sh
->out
.semantic_indices
[num_outs
] = 0;
1110 output_map
[num_outs
++] = clipdist_slot
[0];
1112 sh
->out
.semantic_names
[num_outs
] = TGSI_SEMANTIC_CLIPDIST
;
1113 sh
->out
.semantic_indices
[num_outs
] = 1;
1114 output_map
[num_outs
++] = clipdist_slot
[1];
1118 * make BCOLOR follow COLOR so that we can make use of
1119 * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING in 3DSTATE_SF
1121 for (i
= 0; i
< 4; i
++) {
1122 const int slot
= color_slot
[i
];
1127 sh
->out
.semantic_names
[num_outs
] = tgsi
->outputs
[slot
].semantic_name
;
1128 sh
->out
.semantic_indices
[num_outs
] = tgsi
->outputs
[slot
].semantic_index
;
1130 output_map
[num_outs
++] = slot
;
1133 /* add the rest of the outputs */
1134 for (i
= 0; i
< tgsi
->num_outputs
; i
++) {
1135 switch (tgsi
->outputs
[i
].semantic_name
) {
/* these were already placed above */
1136 case TGSI_SEMANTIC_PSIZE
:
1137 case TGSI_SEMANTIC_POSITION
:
1138 case TGSI_SEMANTIC_CLIPDIST
:
1139 case TGSI_SEMANTIC_COLOR
:
1140 case TGSI_SEMANTIC_BCOLOR
:
1143 sh
->out
.semantic_names
[num_outs
] = tgsi
->outputs
[i
].semantic_name
;
1144 sh
->out
.semantic_indices
[num_outs
] = tgsi
->outputs
[i
].semantic_index
;
1145 output_map
[num_outs
++] = i
;
1150 sh
->out
.count
= num_outs
;
/*
 * Assign registers to the variables in gcc->vars, starting at
 * gcc->first_free_grf: the URB_WRITE header, the temporary, and -- when
 * stream output is enabled -- out_vue_min_count - 1 vertex buffers of
 * shader->out.count registers each, plus so_written and so_index.
 *
 * gcc->vars.tgsi_outs is then filled in one of two ways.  Without a TGSI
 * register mapping, each output gets a fresh GRF.  Otherwise each output is
 * the VRF of the TGSI output named by gcc->output_map, or an immediate zero
 * when the shader does not write it (an integer zero for output 0, 0.0f for
 * the others).
 * NOTE(review): the grf increments between the assignments, the else
 * keywords and the vrf >= 0 test are not visible in this listing.
 */
1154 gs_setup_vars(struct gs_compile_context
*gcc
)
1156 int grf
= gcc
->first_free_grf
;
1159 gcc
->vars
.urb_write_header
= tdst_d(tdst(TOY_FILE_GRF
, grf
, 0));
1162 gcc
->vars
.tmp
= tdst(TOY_FILE_GRF
, grf
, 0);
1165 if (gcc
->write_so
) {
/* the newest vertex is read from tgsi_outs, so one buffer fewer is needed */
1166 gcc
->vars
.buffer_needed
= gcc
->out_vue_min_count
- 1;
1167 for (i
= 0; i
< gcc
->vars
.buffer_needed
; i
++) {
1168 gcc
->vars
.buffers
[i
] = tdst(TOY_FILE_GRF
, grf
, 0);
1169 grf
+= gcc
->shader
->out
.count
;
1172 gcc
->vars
.so_written
= tdst_d(tdst(TOY_FILE_GRF
, grf
, 0));
1175 gcc
->vars
.so_index
= tdst_d(tdst(TOY_FILE_GRF
, grf
, 0));
1179 gcc
->first_free_grf
= grf
;
1181 if (!gcc
->tgsi
.reg_mapping
) {
1182 for (i
= 0; i
< gcc
->shader
->out
.count
; i
++)
1183 gcc
->vars
.tgsi_outs
[i
] = tsrc(TOY_FILE_GRF
, grf
++, 0);
1185 gcc
->first_free_grf
= grf
;
1189 for (i
= 0; i
< gcc
->shader
->out
.count
; i
++) {
1190 const int slot
= gcc
->output_map
[i
];
1191 const int vrf
= (slot
>= 0) ? toy_tgsi_get_vrf(&gcc
->tgsi
,
1192 TGSI_FILE_OUTPUT
, 0, gcc
->tgsi
.outputs
[slot
].index
) : -1;
1195 gcc
->vars
.tgsi_outs
[i
] = tsrc(TOY_FILE_VRF
, vrf
, 0);
1197 gcc
->vars
.tgsi_outs
[i
] = (i
== 0) ? tsrc_imm_d(0) : tsrc_imm_f(0.0f
);
/*
 * Lay out the thread payload in the GRF file: the header first, then the
 * SVBI register when stream output is enabled, then one block of
 * gcc->in_vue_size registers per input vertex.  shader->in.start_grf
 * records where the vertex data begins; first_free_grf/last_free_grf
 * describe what is left for the compiler (up to register 127).
 * NOTE(review): the declaration and increments of grf are not visible in
 * this listing.
 */
1202 gs_setup_payload(struct gs_compile_context
*gcc
)
1208 /* r0: payload header */
1209 gcc
->payload
.header
= tsrc_d(tsrc(TOY_FILE_GRF
, grf
, 0));
1213 if (gcc
->write_so
) {
1214 gcc
->payload
.svbi
= tsrc_ud(tsrc(TOY_FILE_GRF
, grf
, 0));
1219 gcc
->shader
->in
.start_grf
= grf
;
1221 /* no pull constants */
1224 for (i
= 0; i
< gcc
->in_vue_count
; i
++) {
1225 gcc
->payload
.vues
[i
] = tsrc(TOY_FILE_GRF
, grf
, 0);
1226 grf
+= gcc
->in_vue_size
;
1229 gcc
->first_free_grf
= grf
;
1230 gcc
->last_free_grf
= 127;
1234 * Set up GS compile context. This includes translating the TGSI tokens.
/*
 * Initialize *gcc for one compile: zero it, allocate the ilo_shader, record
 * variant and stream-output info, and initialize the toy compiler
 * (BRW_ALIGN_16, execution size 4 by default).
 *
 * With TGSI tokens, the tokens are translated and the input/output vertex
 * counts are derived from the shader's gs_input_prim / gs_output_prim
 * properties; unsupported primitive types fail the compile.  Without tokens
 * (pass-through), both counts are num_verts, the TGSI outputs mirror the
 * variant's inputs, and the access mode is switched to BRW_ALIGN_1.
 *
 * Afterwards the shader inputs/outputs, VUE sizes (two attributes per
 * register, rounded up), payload layout and MRF range (1..15) are set up.
 * NOTE(review): the last parameter (num_verts is used below), the allocation
 * failure check, the break/default/else lines, the gs_setup_vars() call and
 * the return statements are not visible in this listing.
 */
1237 gs_setup(struct gs_compile_context
*gcc
,
1238 const struct ilo_shader_state
*state
,
1239 const struct ilo_shader_variant
*variant
,
1242 memset(gcc
, 0, sizeof(*gcc
));
1244 gcc
->shader
= CALLOC_STRUCT(ilo_shader
);
1248 gcc
->variant
= variant
;
1249 gcc
->so_info
= &state
->info
.stream_output
;
1251 toy_compiler_init(&gcc
->tc
, state
->info
.dev
);
1253 gcc
->write_so
= (state
->info
.stream_output
.num_outputs
> 0);
1254 gcc
->write_vue
= !gcc
->variant
->u
.gs
.rasterizer_discard
;
1256 gcc
->tc
.templ
.access_mode
= BRW_ALIGN_16
;
1257 gcc
->tc
.templ
.exec_size
= BRW_EXECUTE_4
;
1258 gcc
->tc
.rect_linear_width
= 4;
1260 if (state
->info
.tokens
) {
1261 if (!gs_setup_tgsi(&gcc
->tc
, state
->info
.tokens
, &gcc
->tgsi
)) {
1262 toy_compiler_cleanup(&gcc
->tc
);
/* number of input vertices per primitive */
1267 switch (gcc
->tgsi
.props
.gs_input_prim
) {
1268 case PIPE_PRIM_POINTS
:
1269 gcc
->in_vue_count
= 1;
1271 case PIPE_PRIM_LINES
:
1272 gcc
->in_vue_count
= 2;
1273 gcc
->shader
->in
.discard_adj
= true;
1275 case PIPE_PRIM_TRIANGLES
:
1276 gcc
->in_vue_count
= 3;
1277 gcc
->shader
->in
.discard_adj
= true;
1279 case PIPE_PRIM_LINES_ADJACENCY
:
1280 gcc
->in_vue_count
= 4;
1282 case PIPE_PRIM_TRIANGLES_ADJACENCY
:
1283 gcc
->in_vue_count
= 6;
1286 tc_fail(&gcc
->tc
, "unsupported GS input type");
1287 gcc
->in_vue_count
= 0;
/* minimum number of vertices that forms one output primitive */
1291 switch (gcc
->tgsi
.props
.gs_output_prim
) {
1292 case PIPE_PRIM_POINTS
:
1293 gcc
->out_vue_min_count
= 1;
1295 case PIPE_PRIM_LINE_STRIP
:
1296 gcc
->out_vue_min_count
= 2;
1298 case PIPE_PRIM_TRIANGLE_STRIP
:
1299 gcc
->out_vue_min_count
= 3;
1302 tc_fail(&gcc
->tc
, "unsupported GS output type");
1303 gcc
->out_vue_min_count
= 0;
1310 gcc
->in_vue_count
= num_verts
;
1311 gcc
->out_vue_min_count
= num_verts
;
/* pass-through: every input becomes an output with the same semantic */
1313 gcc
->tgsi
.num_outputs
= gcc
->variant
->u
.gs
.num_inputs
;
1314 for (i
= 0; i
< gcc
->variant
->u
.gs
.num_inputs
; i
++) {
1315 gcc
->tgsi
.outputs
[i
].semantic_name
=
1316 gcc
->variant
->u
.gs
.semantic_names
[i
];
1317 gcc
->tgsi
.outputs
[i
].semantic_index
=
1318 gcc
->variant
->u
.gs
.semantic_indices
[i
];
1322 gcc
->tc
.templ
.access_mode
= BRW_ALIGN_1
;
1324 gs_setup_shader_in(gcc
->shader
, gcc
->variant
);
1325 gs_setup_shader_out(gcc
->shader
, &gcc
->tgsi
, false, gcc
->output_map
);
1327 gcc
->in_vue_size
= (gcc
->shader
->in
.count
+ 1) / 2;
1329 gcc
->out_vue_size
= (gcc
->shader
->out
.count
+ 1) / 2;
1331 gs_setup_payload(gcc
);
1334 /* m0 is reserved for system routines */
1335 gcc
->first_free_mrf
= 1;
1336 gcc
->last_free_mrf
= 15;
1342 * Compile the geometry shader.
/*
 * Entry point: compile the geometry shader described by state for the given
 * variant.  Sets up a compile context (num_verts is 0 because the vertex
 * counts come from the TGSI properties), compiles, and releases the TGSI and
 * compiler state.
 * NOTE(review): the return statements and the failure handling inside the
 * two if bodies are not visible in this listing; presumably the compiled
 * gcc.shader is returned on success and NULL on failure -- confirm.
 */
1345 ilo_shader_compile_gs(const struct ilo_shader_state
*state
,
1346 const struct ilo_shader_variant
*variant
)
1348 struct gs_compile_context gcc
;
1350 if (!gs_setup(&gcc
, state
, variant
, 0))
1353 if (!gs_compile(&gcc
)) {
1358 toy_tgsi_cleanup(&gcc
.tgsi
);
1359 toy_compiler_cleanup(&gcc
.tc
);
/*
 * Append the GS kernel to the VS kernel so that both live in one buffer.
 * The VS kernel is grown with REALLOC, the GS kernel is copied in at the
 * next 64-byte boundary, and the VS records the offset in
 * gs_offsets[num_verts - 1] together with the GS start GRF.  The GS shader
 * object is destroyed afterwards; vs->stream_output is set.
 * NOTE(review): the check of the REALLOC result and the return statements
 * are not visible in this listing.  The visible code does not write the
 * padding bytes between the old vs->kernel_size and gs_offset.
 */
1365 append_gs_to_vs(struct ilo_shader
*vs
, struct ilo_shader
*gs
, int num_verts
)
1373 /* kernels must be aligned to 64-byte */
1374 gs_offset
= align(vs
->kernel_size
, 64);
1375 combined
= REALLOC(vs
->kernel
, vs
->kernel_size
,
1376 gs_offset
+ gs
->kernel_size
);
1380 memcpy(combined
+ gs_offset
, gs
->kernel
, gs
->kernel_size
);
1382 vs
->kernel
= combined
;
1383 vs
->kernel_size
= gs_offset
+ gs
->kernel_size
;
1385 vs
->stream_output
= true;
1386 vs
->gs_offsets
[num_verts
- 1] = gs_offset
;
1387 vs
->gs_start_grf
= gs
->in
.start_grf
;
1389 ilo_shader_destroy(gs
);
/*
 * Entry point: build a pass-through geometry shader for a vertex shader and
 * append it to the VS kernel.
 *
 * A GS state and variant are derived from the VS ones: the state carries no
 * tokens, every stream-output register index is remapped through
 * so_mapping, and the variant's GS inputs are the VS outputs (same semantic
 * names and indices).  The shader is compiled for num_verts = 3 input
 * vertices and handed to append_gs_to_vs().
 * NOTE(review): the line that initializes state before its fields are read
 * (presumably a copy of *vs_state), the failure handling inside the two if
 * bodies and the early returns are not visible in this listing.
 */
1395 ilo_shader_compile_gs_passthrough(const struct ilo_shader_state
*vs_state
,
1396 const struct ilo_shader_variant
*vs_variant
,
1397 const int *so_mapping
,
1398 struct ilo_shader
*vs
)
1400 struct gs_compile_context gcc
;
1401 struct ilo_shader_state state
;
1402 struct ilo_shader_variant variant
;
1403 const int num_verts
= 3;
1406 /* init GS state and variant */
/* no tokens selects the pass-through path in gs_setup() */
1408 state
.info
.tokens
= NULL
;
1409 for (i
= 0; i
< state
.info
.stream_output
.num_outputs
; i
++) {
1410 const int reg
= state
.info
.stream_output
.output
[i
].register_index
;
1412 state
.info
.stream_output
.output
[i
].register_index
= so_mapping
[reg
];
1415 variant
= *vs_variant
;
1416 variant
.u
.gs
.rasterizer_discard
= vs_variant
->u
.vs
.rasterizer_discard
;
1417 variant
.u
.gs
.num_inputs
= vs
->out
.count
;
1418 for (i
= 0; i
< vs
->out
.count
; i
++) {
1419 variant
.u
.gs
.semantic_names
[i
] =
1420 vs
->out
.semantic_names
[i
];
1421 variant
.u
.gs
.semantic_indices
[i
] =
1422 vs
->out
.semantic_indices
[i
];
1425 if (!gs_setup(&gcc
, &state
, &variant
, num_verts
))
1428 if (!gs_compile_passthrough(&gcc
)) {
1433 /* no need to call toy_tgsi_cleanup() */
1434 toy_compiler_cleanup(&gcc
.tc
);
1436 return append_gs_to_vs(vs
, gcc
.shader
, num_verts
);