/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
 /*
  * Authors:
  *   Keith Whitwell <keith@tungstengraphics.com>
  */
#include "main/macros.h"
#include "shader/program.h"
#include "shader/prog_parameter.h"
#include "shader/prog_print.h"
#include "brw_context.h"
41 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
43 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
45 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
46 c
->prog_data
.total_grf
= c
->last_tmp
;
51 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
53 if (tmp
.nr
== c
->last_tmp
-1)
57 static void release_tmps( struct brw_vs_compile
*c
)
59 c
->last_tmp
= c
->first_tmp
;
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
68 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
70 GLuint i
, reg
= 0, mrf
;
72 /* Determine whether to use a real constant buffer or use a block
73 * of GRF registers for constants. The later is faster but only
74 * works if everything fits in the GRF.
75 * XXX this heuristic/check may need some fine tuning...
77 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
78 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
79 c
->vp
->use_const_buffer
= GL_TRUE
;
81 c
->vp
->use_const_buffer
= GL_FALSE
;
83 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
85 /* r0 -- reserved as usual
87 c
->r0
= brw_vec8_grf(reg
, 0);
90 /* User clip planes from curbe:
92 if (c
->key
.nr_userclip
) {
93 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
94 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
97 /* Deal with curbe alignment:
99 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
102 /* Vertex program parameters from curbe:
104 if (c
->vp
->use_const_buffer
) {
105 /* get constants from a real constant buffer */
106 c
->prog_data
.curb_read_length
= 0;
107 c
->prog_data
.nr_params
= 4; /* XXX 0 causes a bug elsewhere... */
110 /* use a section of the GRF for constants */
111 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
112 for (i
= 0; i
< nr_params
; i
++) {
113 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
115 reg
+= (nr_params
+ 1) / 2;
116 c
->prog_data
.curb_read_length
= reg
- 1;
118 c
->prog_data
.nr_params
= nr_params
* 4;
121 /* Allocate input regs:
124 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
125 if (c
->prog_data
.inputs_read
& (1 << i
)) {
127 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
132 /* Allocate outputs. The non-position outputs go straight into message regs.
135 c
->first_output
= reg
;
137 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
138 if (c
->prog_data
.outputs_written
& (1 << i
)) {
140 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
141 if (i
== VERT_RESULT_HPOS
) {
142 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
145 else if (i
== VERT_RESULT_PSIZ
) {
146 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
148 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
151 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
157 /* Allocate program temporaries:
159 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
160 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
164 /* Address reg(s). Don't try to use the internal address reg until
167 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
168 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
172 BRW_VERTICAL_STRIDE_8
,
174 BRW_HORIZONTAL_STRIDE_1
,
180 if (c
->vp
->use_const_buffer
) {
181 for (i
= 0; i
< 3; i
++) {
182 c
->current_const
[i
].index
= -1;
183 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
188 for (i
= 0; i
< 128; i
++) {
189 if (c
->output_regs
[i
].used_in_src
) {
190 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
195 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
198 /* Some opcodes need an internal temporary:
201 c
->last_tmp
= reg
; /* for allocation purposes */
203 /* Each input reg holds data from two vertices. The
204 * urb_read_length is the number of registers read from *each*
205 * vertex urb, so is half the amount:
207 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
209 c
->prog_data
.urb_entry_size
= (c
->nr_outputs
+ 2 + 3) / 4;
210 c
->prog_data
.total_grf
= reg
;
212 if (INTEL_DEBUG
& DEBUG_VS
) {
213 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
214 _mesa_printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
215 _mesa_printf("%s reg = %d\n", __FUNCTION__
, reg
);
221 * If an instruction uses a temp reg both as a src and the dest, we
222 * sometimes need to allocate an intermediate temporary.
224 static void unalias1( struct brw_vs_compile
*c
,
227 void (*func
)( struct brw_vs_compile
*,
231 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
232 struct brw_compile
*p
= &c
->func
;
233 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
235 brw_MOV(p
, dst
, tmp
);
245 * Checkes if 2-operand instruction needs an intermediate temporary.
247 static void unalias2( struct brw_vs_compile
*c
,
251 void (*func
)( struct brw_vs_compile
*,
256 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
257 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
258 struct brw_compile
*p
= &c
->func
;
259 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
260 func(c
, tmp
, arg0
, arg1
);
261 brw_MOV(p
, dst
, tmp
);
265 func(c
, dst
, arg0
, arg1
);
271 * Checkes if 3-operand instruction needs an intermediate temporary.
273 static void unalias3( struct brw_vs_compile
*c
,
278 void (*func
)( struct brw_vs_compile
*,
284 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
285 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
286 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
287 struct brw_compile
*p
= &c
->func
;
288 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
289 func(c
, tmp
, arg0
, arg1
, arg2
);
290 brw_MOV(p
, dst
, tmp
);
294 func(c
, dst
, arg0
, arg1
, arg2
);
298 static void emit_sop( struct brw_compile
*p
,
304 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
305 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
306 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
307 brw_set_predicate_control_flag_value(p
, 0xff);
310 static void emit_seq( struct brw_compile
*p
,
313 struct brw_reg arg1
)
315 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
318 static void emit_sne( struct brw_compile
*p
,
321 struct brw_reg arg1
)
323 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
325 static void emit_slt( struct brw_compile
*p
,
328 struct brw_reg arg1
)
330 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
333 static void emit_sle( struct brw_compile
*p
,
336 struct brw_reg arg1
)
338 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
341 static void emit_sgt( struct brw_compile
*p
,
344 struct brw_reg arg1
)
346 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
349 static void emit_sge( struct brw_compile
*p
,
352 struct brw_reg arg1
)
354 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
357 static void emit_max( struct brw_compile
*p
,
360 struct brw_reg arg1
)
362 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
363 brw_SEL(p
, dst
, arg1
, arg0
);
364 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
367 static void emit_min( struct brw_compile
*p
,
370 struct brw_reg arg1
)
372 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
373 brw_SEL(p
, dst
, arg0
, arg1
);
374 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
378 static void emit_math1( struct brw_vs_compile
*c
,
384 /* There are various odd behaviours with SEND on the simulator. In
385 * addition there are documented issues with the fact that the GEN4
386 * processor doesn't do dependency control properly on SEND
387 * results. So, on balance, this kludge to get around failures
388 * with writemasked math results looks like it might be necessary
389 * whether that turns out to be a simulator bug or not:
391 struct brw_compile
*p
= &c
->func
;
392 struct brw_reg tmp
= dst
;
393 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
394 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
402 BRW_MATH_SATURATE_NONE
,
405 BRW_MATH_DATA_SCALAR
,
409 brw_MOV(p
, dst
, tmp
);
415 static void emit_math2( struct brw_vs_compile
*c
,
422 struct brw_compile
*p
= &c
->func
;
423 struct brw_reg tmp
= dst
;
424 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
425 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
430 brw_MOV(p
, brw_message_reg(3), arg1
);
435 BRW_MATH_SATURATE_NONE
,
438 BRW_MATH_DATA_SCALAR
,
442 brw_MOV(p
, dst
, tmp
);
448 static void emit_exp_noalias( struct brw_vs_compile
*c
,
450 struct brw_reg arg0
)
452 struct brw_compile
*p
= &c
->func
;
455 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
456 struct brw_reg tmp
= get_tmp(c
);
457 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
459 /* tmp_d = floor(arg0.x) */
460 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
462 /* result[0] = 2.0 ^ tmp */
464 /* Adjust exponent for floating point:
467 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
469 /* Install exponent and sign.
470 * Excess drops off the edge:
472 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
473 tmp_d
, brw_imm_d(23));
478 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
479 /* result[1] = arg0.x - floor(arg0.x) */
480 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
483 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
484 /* As with the LOG instruction, we might be better off just
485 * doing a taylor expansion here, seeing as we have to do all
488 * If mathbox partial precision is too low, consider also:
489 * result[3] = result[0] * EXP(result[1])
492 BRW_MATH_FUNCTION_EXP
,
493 brw_writemask(dst
, WRITEMASK_Z
),
494 brw_swizzle1(arg0
, 0),
495 BRW_MATH_PRECISION_FULL
);
498 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
499 /* result[3] = 1.0; */
500 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
505 static void emit_log_noalias( struct brw_vs_compile
*c
,
507 struct brw_reg arg0
)
509 struct brw_compile
*p
= &c
->func
;
510 struct brw_reg tmp
= dst
;
511 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
512 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
513 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
514 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
518 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
521 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
524 * These almost look likey they could be joined up, but not really
527 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
528 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
530 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
532 brw_writemask(tmp_ud
, WRITEMASK_X
),
533 brw_swizzle1(arg0_ud
, 0),
534 brw_imm_ud((1U<<31)-1));
537 brw_writemask(tmp_ud
, WRITEMASK_X
),
542 brw_writemask(tmp
, WRITEMASK_X
),
543 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
547 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
549 brw_writemask(tmp_ud
, WRITEMASK_Y
),
550 brw_swizzle1(arg0_ud
, 0),
551 brw_imm_ud((1<<23)-1));
554 brw_writemask(tmp_ud
, WRITEMASK_Y
),
556 brw_imm_ud(127<<23));
559 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
560 /* result[2] = result[0] + LOG2(result[1]); */
562 /* Why bother? The above is just a hint how to do this with a
563 * taylor series. Maybe we *should* use a taylor series as by
564 * the time all the above has been done it's almost certainly
565 * quicker than calling the mathbox, even with low precision.
568 * - result[0] + mathbox.LOG2(result[1])
569 * - mathbox.LOG2(arg0.x)
570 * - result[0] + inline_taylor_approx(result[1])
573 BRW_MATH_FUNCTION_LOG
,
574 brw_writemask(tmp
, WRITEMASK_Z
),
575 brw_swizzle1(tmp
, 1),
576 BRW_MATH_PRECISION_FULL
);
579 brw_writemask(tmp
, WRITEMASK_Z
),
580 brw_swizzle1(tmp
, 2),
581 brw_swizzle1(tmp
, 0));
584 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
585 /* result[3] = 1.0; */
586 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
590 brw_MOV(p
, dst
, tmp
);
596 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
598 static void emit_dst_noalias( struct brw_vs_compile
*c
,
603 struct brw_compile
*p
= &c
->func
;
605 /* There must be a better way to do this:
607 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
608 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
609 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
610 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
611 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
612 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
613 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
614 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
618 static void emit_xpd( struct brw_compile
*p
,
623 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
624 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
628 static void emit_lit_noalias( struct brw_vs_compile
*c
,
630 struct brw_reg arg0
)
632 struct brw_compile
*p
= &c
->func
;
633 struct brw_instruction
*if_insn
;
634 struct brw_reg tmp
= dst
;
635 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
640 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
641 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
643 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
644 * to get all channels active inside the IF. In the clipping code
645 * we run with NoMask, so it's not an option and we can use
646 * BRW_EXECUTE_1 for all comparisions.
648 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
649 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
651 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
653 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
654 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
655 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
658 BRW_MATH_FUNCTION_POW
,
659 brw_writemask(dst
, WRITEMASK_Z
),
660 brw_swizzle1(tmp
, 2),
661 brw_swizzle1(arg0
, 3),
662 BRW_MATH_PRECISION_PARTIAL
);
665 brw_ENDIF(p
, if_insn
);
670 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
676 struct brw_compile
*p
= &c
->func
;
678 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
679 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
680 brw_MAC(p
, dst
, arg0
, arg1
);
683 /** 3 or 4-component vector normalization */
684 static void emit_nrm( struct brw_vs_compile
*c
,
689 struct brw_compile
*p
= &c
->func
;
690 struct brw_reg tmp
= get_tmp(c
);
692 /* tmp = dot(arg0, arg0) */
694 brw_DP3(p
, tmp
, arg0
, arg0
);
696 brw_DP4(p
, tmp
, arg0
, arg0
);
698 /* tmp = 1 / sqrt(tmp) */
699 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
701 /* dst = arg0 * tmp */
702 brw_MUL(p
, dst
, arg0
, tmp
);
708 static struct brw_reg
709 get_constant(struct brw_vs_compile
*c
,
710 const struct prog_instruction
*inst
,
713 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
714 struct brw_compile
*p
= &c
->func
;
715 struct brw_reg const_reg
;
716 struct brw_reg const2_reg
;
717 const GLboolean relAddr
= src
->RelAddr
;
719 assert(argIndex
< 3);
721 if (c
->current_const
[argIndex
].index
!= src
->Index
|| relAddr
) {
722 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
724 c
->current_const
[argIndex
].index
= src
->Index
;
727 printf(" fetch const[%d] for arg %d into reg %d\n",
728 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
730 /* need to fetch the constant now */
732 c
->current_const
[argIndex
].reg
,/* writeback dest */
734 relAddr
, /* relative indexing? */
735 addrReg
, /* address register */
736 16 * src
->Index
, /* byte offset */
737 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
742 const2_reg
= get_tmp(c
);
744 /* use upper half of address reg for second read */
745 addrReg
= stride(addrReg
, 0, 4, 0);
749 const2_reg
, /* writeback dest */
751 relAddr
, /* relative indexing? */
752 addrReg
, /* address register */
753 16 * src
->Index
, /* byte offset */
754 SURF_INDEX_VERT_CONST_BUFFER
759 const_reg
= c
->current_const
[argIndex
].reg
;
762 /* merge the two Owords into the constant register */
763 /* const_reg[7..4] = const2_reg[7..4] */
765 suboffset(stride(const_reg
, 0, 4, 1), 4),
766 suboffset(stride(const2_reg
, 0, 4, 1), 4));
767 release_tmp(c
, const2_reg
);
770 /* replicate lower four floats into upper half (to get XYZWXYZW) */
771 const_reg
= stride(const_reg
, 0, 4, 0);
780 /* TODO: relative addressing!
782 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
783 gl_register_file file
,
787 case PROGRAM_TEMPORARY
:
790 assert(c
->regs
[file
][index
].nr
!= 0);
791 return c
->regs
[file
][index
];
792 case PROGRAM_STATE_VAR
:
793 case PROGRAM_CONSTANT
:
794 case PROGRAM_UNIFORM
:
795 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
796 return c
->regs
[PROGRAM_STATE_VAR
][index
];
797 case PROGRAM_ADDRESS
:
799 return c
->regs
[file
][index
];
801 case PROGRAM_UNDEFINED
: /* undef values */
802 return brw_null_reg();
804 case PROGRAM_LOCAL_PARAM
:
805 case PROGRAM_ENV_PARAM
:
806 case PROGRAM_WRITE_ONLY
:
809 return brw_null_reg();
815 * Indirect addressing: get reg[[arg] + offset].
817 static struct brw_reg
deref( struct brw_vs_compile
*c
,
821 struct brw_compile
*p
= &c
->func
;
822 struct brw_reg tmp
= vec4(get_tmp(c
));
823 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
824 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_UW
);
825 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
826 struct brw_reg indirect
= brw_vec4_indirect(0,0);
829 brw_push_insn_state(p
);
830 brw_set_access_mode(p
, BRW_ALIGN_1
);
832 /* This is pretty clunky - load the address register twice and
833 * fetch each 4-dword value in turn. There must be a way to do
834 * this in a single pass, but I couldn't get it to work.
836 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
837 brw_MOV(p
, tmp
, indirect
);
839 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
840 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
842 brw_pop_insn_state(p
);
845 /* NOTE: tmp not released */
851 * Get brw reg corresponding to the instruction's [argIndex] src reg.
852 * TODO: relative addressing!
854 static struct brw_reg
855 get_src_reg( struct brw_vs_compile
*c
,
856 const struct prog_instruction
*inst
,
859 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
860 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
861 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
864 case PROGRAM_TEMPORARY
:
868 return deref(c
, c
->regs
[file
][0], index
);
871 assert(c
->regs
[file
][index
].nr
!= 0);
872 return c
->regs
[file
][index
];
875 case PROGRAM_STATE_VAR
:
876 case PROGRAM_CONSTANT
:
877 case PROGRAM_UNIFORM
:
878 if (c
->vp
->use_const_buffer
) {
879 return get_constant(c
, inst
, argIndex
);
882 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
);
885 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
886 return c
->regs
[PROGRAM_STATE_VAR
][index
];
888 case PROGRAM_ADDRESS
:
890 return c
->regs
[file
][index
];
892 case PROGRAM_UNDEFINED
:
893 /* this is a normal case since we loop over all three src args */
894 return brw_null_reg();
896 case PROGRAM_LOCAL_PARAM
:
897 case PROGRAM_ENV_PARAM
:
898 case PROGRAM_WRITE_ONLY
:
901 return brw_null_reg();
906 static void emit_arl( struct brw_vs_compile
*c
,
908 struct brw_reg arg0
)
910 struct brw_compile
*p
= &c
->func
;
911 struct brw_reg tmp
= dst
;
912 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
917 brw_RNDD(p
, tmp
, arg0
); /* tmp = round(arg0) */
918 brw_MUL(p
, dst
, tmp
, brw_imm_d(16)); /* dst = tmp * 16 */
926 * Return the brw reg for the given instruction's src argument.
927 * Will return mangled results for SWZ op. The emit_swz() function
928 * ignores this result and recalculates taking extended swizzles into
931 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
932 const struct prog_instruction
*inst
,
935 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
938 if (src
->File
== PROGRAM_UNDEFINED
)
939 return brw_null_reg();
941 reg
= get_src_reg(c
, inst
, argIndex
);
943 /* Convert 3-bit swizzle to 2-bit.
945 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
946 GET_SWZ(src
->Swizzle
, 1),
947 GET_SWZ(src
->Swizzle
, 2),
948 GET_SWZ(src
->Swizzle
, 3));
950 /* Note this is ok for non-swizzle instructions:
952 reg
.negate
= src
->Negate
? 1 : 0;
959 * Get brw register for the given program dest register.
961 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
962 struct prog_dst_register dst
)
967 case PROGRAM_TEMPORARY
:
969 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
970 reg
= c
->regs
[dst
.File
][dst
.Index
];
972 case PROGRAM_ADDRESS
:
973 assert(dst
.Index
== 0);
974 reg
= c
->regs
[dst
.File
][dst
.Index
];
976 case PROGRAM_UNDEFINED
:
977 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
978 reg
= brw_null_reg();
982 reg
= brw_null_reg();
985 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
991 static void emit_swz( struct brw_vs_compile
*c
,
993 const struct prog_instruction
*inst
)
995 const GLuint argIndex
= 0;
996 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
997 struct brw_compile
*p
= &c
->func
;
998 GLuint zeros_mask
= 0;
999 GLuint ones_mask
= 0;
1000 GLuint src_mask
= 0;
1002 GLboolean need_tmp
= (src
.Negate
&&
1003 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1004 struct brw_reg tmp
= dst
;
1010 for (i
= 0; i
< 4; i
++) {
1011 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1012 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1031 /* Do src first, in case dst aliases src:
1034 struct brw_reg arg0
;
1036 arg0
= get_src_reg(c
, inst
, argIndex
);
1038 arg0
= brw_swizzle(arg0
,
1039 src_swz
[0], src_swz
[1],
1040 src_swz
[2], src_swz
[3]);
1042 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1046 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1049 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1052 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1055 brw_MOV(p
, dst
, tmp
);
1056 release_tmp(c
, tmp
);
1062 * Post-vertex-program processing. Send the results to the URB.
1064 static void emit_vertex_write( struct brw_vs_compile
*c
)
1066 struct brw_compile
*p
= &c
->func
;
1067 struct brw_reg m0
= brw_message_reg(0);
1068 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1071 if (c
->key
.copy_edgeflag
) {
1073 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1074 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1077 /* Build ndc coords */
1079 /* ndc = 1.0 / pos.w */
1080 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1081 /* ndc.xyz = pos * ndc */
1082 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1084 /* Update the header for point size, user clipping flags, and -ve rhw
1087 if ((c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) ||
1088 c
->key
.nr_userclip
|| !BRW_IS_G4X(p
->brw
))
1090 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1093 brw_MOV(p
, header1
, brw_imm_ud(0));
1095 brw_set_access_mode(p
, BRW_ALIGN_16
);
1097 if (c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) {
1098 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1099 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1100 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1103 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1104 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1105 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1106 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1107 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1110 /* i965 clipping workaround:
1111 * 1) Test for -ve rhw
1113 * set ndc = (0,0,0,0)
1116 * Later, clipping will detect ucp[6] and ensure the primitive is
1117 * clipped against all fixed planes.
1119 if (!BRW_IS_G4X(p
->brw
)) {
1121 vec8(brw_null_reg()),
1123 brw_swizzle1(ndc
, 3),
1126 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1127 brw_MOV(p
, ndc
, brw_imm_f(0));
1128 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1131 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1132 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1133 brw_set_access_mode(p
, BRW_ALIGN_16
);
1135 release_tmp(c
, header1
);
1138 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1141 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1142 * of zeros followed by two sets of NDC coordinates:
1144 brw_set_access_mode(p
, BRW_ALIGN_1
);
1145 brw_MOV(p
, offset(m0
, 2), ndc
);
1146 brw_MOV(p
, offset(m0
, 3), pos
);
1149 brw_null_reg(), /* dest */
1150 0, /* starting mrf reg nr */
1154 c
->nr_outputs
+ 3, /* msg len */
1155 0, /* response len */
1157 1, /* writes complete */
1158 0, /* urb destination offset */
1159 BRW_URB_SWIZZLE_INTERLEAVE
);
1164 * Called after code generation to resolve subroutine calls and the
1166 * \param end_inst points to brw code for END instruction
1167 * \param last_inst points to last instruction emitted before vertex write
1170 post_vs_emit( struct brw_vs_compile
*c
,
1171 struct brw_instruction
*end_inst
,
1172 struct brw_instruction
*last_inst
)
1176 brw_resolve_cals(&c
->func
);
1178 /* patch up the END code to jump past subroutines, etc */
1179 offset
= last_inst
- end_inst
;
1180 brw_set_src1(end_inst
, brw_imm_d(offset
* 16));
1184 /* Emit the vertex program instructions here.
1186 void brw_vs_emit(struct brw_vs_compile
*c
)
1188 #define MAX_IF_DEPTH 32
1189 #define MAX_LOOP_DEPTH 32
1190 struct brw_compile
*p
= &c
->func
;
1191 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1192 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1193 GLuint end_offset
= 0;
1194 struct brw_instruction
*end_inst
, *last_inst
;
1195 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
];
1196 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1200 if (INTEL_DEBUG
& DEBUG_VS
) {
1201 _mesa_printf("vs-emit:\n");
1202 _mesa_print_program(&c
->vp
->program
.Base
);
1206 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1207 brw_set_access_mode(p
, BRW_ALIGN_16
);
1209 /* Message registers can't be read, so copy the output into GRF register
1210 if they are used in source registers */
1211 for (insn
= 0; insn
< nr_insns
; insn
++) {
1213 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1214 for (i
= 0; i
< 3; i
++) {
1215 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1216 GLuint index
= src
->Index
;
1217 GLuint file
= src
->File
;
1218 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1219 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1223 /* Static register allocation
1225 brw_vs_alloc_regs(c
);
1226 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1228 for (insn
= 0; insn
< nr_insns
; insn
++) {
1230 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1231 struct brw_reg args
[3], dst
;
1235 printf("%d: ", insn
);
1236 _mesa_print_instruction(inst
);
1239 /* Get argument regs. SWZ is special and does this itself.
1241 if (inst
->Opcode
!= OPCODE_SWZ
)
1242 for (i
= 0; i
< 3; i
++) {
1243 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1246 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1247 args
[i
] = c
->output_regs
[index
].reg
;
1249 args
[i
] = get_arg(c
, inst
, i
);
1252 /* Get dest regs. Note that it is possible for a reg to be both
1253 * dst and arg, given the static allocation of registers. So
1254 * care needs to be taken emitting multi-operation instructions.
1256 index
= inst
->DstReg
.Index
;
1257 file
= inst
->DstReg
.File
;
1258 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1259 dst
= c
->output_regs
[index
].reg
;
1261 dst
= get_dst(c
, inst
->DstReg
);
1263 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1264 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1265 inst
->SaturateMode
);
1268 switch (inst
->Opcode
) {
1270 brw_MOV(p
, dst
, brw_abs(args
[0]));
1273 brw_ADD(p
, dst
, args
[0], args
[1]);
1276 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1279 brw_DP3(p
, dst
, args
[0], args
[1]);
1282 brw_DP4(p
, dst
, args
[0], args
[1]);
1285 brw_DPH(p
, dst
, args
[0], args
[1]);
1288 emit_nrm(c
, dst
, args
[0], 3);
1291 emit_nrm(c
, dst
, args
[0], 4);
1294 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1297 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1300 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1303 emit_arl(c
, dst
, args
[0]);
1306 brw_RNDD(p
, dst
, args
[0]);
1309 brw_FRC(p
, dst
, args
[0]);
1312 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1315 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1318 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1321 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1324 brw_MOV(p
, brw_acc_reg(), args
[2]);
1325 brw_MAC(p
, dst
, args
[0], args
[1]);
1328 emit_max(p
, dst
, args
[0], args
[1]);
1331 emit_min(p
, dst
, args
[0], args
[1]);
1334 brw_MOV(p
, dst
, args
[0]);
1337 brw_MUL(p
, dst
, args
[0], args
[1]);
1340 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1343 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1346 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1350 emit_seq(p
, dst
, args
[0], args
[1]);
1353 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1356 emit_sne(p
, dst
, args
[0], args
[1]);
1359 emit_sge(p
, dst
, args
[0], args
[1]);
1362 emit_sgt(p
, dst
, args
[0], args
[1]);
1365 emit_slt(p
, dst
, args
[0], args
[1]);
1368 emit_sle(p
, dst
, args
[0], args
[1]);
1371 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1374 /* The args[0] value can't be used here as it won't have
1375 * correctly encoded the full swizzle:
1377 emit_swz(c
, dst
, inst
);
1380 /* round toward zero */
1381 brw_RNDZ(p
, dst
, args
[0]);
1384 emit_xpd(p
, dst
, args
[0], args
[1]);
1387 assert(if_depth
< MAX_IF_DEPTH
);
1388 if_inst
[if_depth
++] = brw_IF(p
, BRW_EXECUTE_8
);
1391 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1394 assert(if_depth
> 0);
1395 brw_ENDIF(p
, if_inst
[--if_depth
]);
1398 case OPCODE_BGNLOOP
:
1399 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1403 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1407 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1409 case OPCODE_ENDLOOP
:
1411 struct brw_instruction
*inst0
, *inst1
;
1413 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1414 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1415 while (inst0
> loop_inst
[loop_depth
]) {
1417 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
) {
1418 inst0
->bits3
.if_else
.jump_count
= inst1
- inst0
+ 1;
1419 inst0
->bits3
.if_else
.pop_count
= 0;
1421 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
) {
1422 inst0
->bits3
.if_else
.jump_count
= inst1
- inst0
;
1423 inst0
->bits3
.if_else
.pop_count
= 0;
1433 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
1434 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1435 brw_set_predicate_control_flag_value(p
, 0xff);
1438 brw_set_access_mode(p
, BRW_ALIGN_1
);
1439 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1440 brw_set_access_mode(p
, BRW_ALIGN_16
);
1441 brw_ADD(p
, get_addr_reg(stack_index
),
1442 get_addr_reg(stack_index
), brw_imm_d(4));
1443 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1444 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1447 brw_ADD(p
, get_addr_reg(stack_index
),
1448 get_addr_reg(stack_index
), brw_imm_d(-4));
1449 brw_set_access_mode(p
, BRW_ALIGN_1
);
1450 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1451 brw_set_access_mode(p
, BRW_ALIGN_16
);
1454 end_offset
= p
->nr_insn
;
1455 /* this instruction will get patched later to jump past subroutine
1458 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1464 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1470 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1471 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1472 _mesa_opcode_string(inst
->Opcode
) :
1476 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1477 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1478 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1479 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1482 /* Result color clamping.
1484 * When destination register is an output register and
1485 * it's primary/secondary front/back color, we have to clamp
1486 * the result to [0,1]. This is done by enabling the
1487 * saturation bit for the last instruction.
1489 * We don't use brw_set_saturate() as it modifies
1490 * p->current->header.saturate, which affects all the subsequent
1491 * instructions. Instead, we directly modify the header
1492 * of the last (already stored) instruction.
1494 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1495 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1496 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1497 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1498 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1499 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1506 end_inst
= &p
->store
[end_offset
];
1507 last_inst
= &p
->store
[p
->nr_insn
];
1509 /* The END instruction will be patched to jump to this code */
1510 emit_vertex_write(c
);
1512 post_vs_emit(c
, end_inst
, last_inst
);