2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
/**
 * Hand out the next scratch GRF register.
 *
 * Builds a vec8 GRF register at index c->last_tmp, then pre-increments
 * last_tmp.  If the incremented value exceeds prog_data.total_grf,
 * total_grf is raised to the new last_tmp so the register high-water
 * mark keeps up with the allocation.
 *
 * NOTE(review): this listing lost lines during extraction (the embedded
 * line numbers are not contiguous); the statement that hands `tmp` back
 * to the caller is not visible here.
 */
41 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
43 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
45 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
46 c
->prog_data
.total_grf
= c
->last_tmp
;
/**
 * Give back a single scratch register.
 *
 * The only visible logic is the test that `tmp` is the most recently
 * allocated register (tmp.nr == c->last_tmp - 1).
 *
 * NOTE(review): the statement guarded by that test is missing from this
 * listing; presumably it steps c->last_tmp back by one -- confirm
 * against the complete file.
 */
51 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
53 if (tmp
.nr
== c
->last_tmp
-1)
/**
 * Release all scratch registers at once by resetting the allocation
 * cursor c->last_tmp to c->first_tmp.
 */
57 static void release_tmps( struct brw_vs_compile
*c
)
59 c
->last_tmp
= c
->first_tmp
;
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
/**
 * Static register allocation for a vertex program, run before code
 * emission.  `reg` is the running GRF index (starts at 0) and `mrf` the
 * running message-register index.
 *
 * Steps that are visible in this listing, in order:
 *  - Choose constant storage: use_const_buffer is set to GL_TRUE when
 *    NumParameters + NumTemporaries + 20 exceeds BRW_MAX_GRF; a GL_FALSE
 *    assignment follows (presumably the else branch -- the keyword is
 *    not visible).
 *  - c->r0 is the vec8 GRF at `reg`.
 *  - User clip planes: userplane[i] is a vec4 view at GRF reg+3+i/2,
 *    sub-register (i%2)*4, i.e. two planes per register; `reg` then
 *    advances by ((6 + nr_userclip + 3) / 4) * 2.
 *  - Program parameters: with a constant buffer, curb_read_length = 0
 *    and nr_params = 4; otherwise each parameter gets a vec4 view packed
 *    two per GRF in regs[PROGRAM_STATE_VAR], `reg` advances by
 *    (nr_params + 1) / 2, curb_read_length = reg - 1 and
 *    prog_data.nr_params = nr_params * 4.
 *  - Inputs: every attribute bit set in inputs_read gets a vec8 GRF.
 *  - Outputs: first_output = reg, first_overflow_output = 0.
 *    VERT_RESULT_HPOS and VERT_RESULT_PSIZ are placed in GRF, other
 *    results in message registers, and results that do not fit in the
 *    MRF fall back to GRF with first_overflow_output recording the
 *    first such index.
 *  - Program temporaries, address registers, the three current_const
 *    slots (index reset to -1, only with a constant buffer), GRF copies
 *    for output_regs[] entries flagged used_in_src, and the stack
 *    register.
 *  - Bookkeeping: last_tmp = reg, urb_read_length = (nr_inputs + 1) / 2,
 *    urb_entry_size = (nr_outputs + 6 + 3) / 4 or (nr_outputs + 2 + 3) / 4
 *    depending on the BRW_IS_IGDNG test, total_grf = reg, plus optional
 *    DEBUG_VS printing.
 *
 * NOTE(review): extraction dropped lines from this listing (embedded
 * line numbers are not contiguous).  In particular the increments of
 * `reg`, the initial value given to `mrf`, and several else/brace lines
 * are not visible; consult the full file before relying on exact
 * register numbering.
 */
68 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
70 GLuint i
, reg
= 0, mrf
;
72 /* Determine whether to use a real constant buffer or use a block
73 * of GRF registers for constants. The later is faster but only
74 * works if everything fits in the GRF.
75 * XXX this heuristic/check may need some fine tuning...
77 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
78 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
79 c
->vp
->use_const_buffer
= GL_TRUE
;
81 c
->vp
->use_const_buffer
= GL_FALSE
;
83 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
85 /* r0 -- reserved as usual
87 c
->r0
= brw_vec8_grf(reg
, 0);
90 /* User clip planes from curbe:
92 if (c
->key
.nr_userclip
) {
93 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
94 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
97 /* Deal with curbe alignment:
99 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
102 /* Vertex program parameters from curbe:
104 if (c
->vp
->use_const_buffer
) {
105 /* get constants from a real constant buffer */
106 c
->prog_data
.curb_read_length
= 0;
107 c
->prog_data
.nr_params
= 4; /* XXX 0 causes a bug elsewhere... */
110 /* use a section of the GRF for constants */
111 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
112 for (i
= 0; i
< nr_params
; i
++) {
113 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
115 reg
+= (nr_params
+ 1) / 2;
116 c
->prog_data
.curb_read_length
= reg
- 1;
118 c
->prog_data
.nr_params
= nr_params
* 4;
121 /* Allocate input regs:
124 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
125 if (c
->prog_data
.inputs_read
& (1 << i
)) {
127 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
132 /* Allocate outputs. The non-position outputs go straight into message regs.
135 c
->first_output
= reg
;
136 c
->first_overflow_output
= 0;
138 if (BRW_IS_IGDNG(c
->func
.brw
))
143 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
144 if (c
->prog_data
.outputs_written
& (1 << i
)) {
146 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
147 if (i
== VERT_RESULT_HPOS
) {
148 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
151 else if (i
== VERT_RESULT_PSIZ
) {
152 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
154 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
158 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
162 /* too many vertex results to fit in MRF, use GRF for overflow */
163 if (!c
->first_overflow_output
)
164 c
->first_overflow_output
= i
;
165 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
172 /* Allocate program temporaries:
174 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
175 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
179 /* Address reg(s). Don't try to use the internal address reg until
182 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
183 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
187 BRW_VERTICAL_STRIDE_8
,
189 BRW_HORIZONTAL_STRIDE_1
,
195 if (c
->vp
->use_const_buffer
) {
196 for (i
= 0; i
< 3; i
++) {
197 c
->current_const
[i
].index
= -1;
198 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
203 for (i
= 0; i
< 128; i
++) {
204 if (c
->output_regs
[i
].used_in_src
) {
205 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
210 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
213 /* Some opcodes need an internal temporary:
216 c
->last_tmp
= reg
; /* for allocation purposes */
218 /* Each input reg holds data from two vertices. The
219 * urb_read_length is the number of registers read from *each*
220 * vertex urb, so is half the amount:
222 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
224 if (BRW_IS_IGDNG(c
->func
.brw
))
225 c
->prog_data
.urb_entry_size
= (c
->nr_outputs
+ 6 + 3) / 4;
227 c
->prog_data
.urb_entry_size
= (c
->nr_outputs
+ 2 + 3) / 4;
229 c
->prog_data
.total_grf
= reg
;
231 if (INTEL_DEBUG
& DEBUG_VS
) {
232 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
233 _mesa_printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
234 _mesa_printf("%s reg = %d\n", __FUNCTION__
, reg
);
240 * If an instruction uses a temp reg both as a src and the dest, we
241 * sometimes need to allocate an intermediate temporary.
/**
 * Run a one-operand emitter (`func`) safely when the destination may be
 * the same register as the source.
 *
 * When dst and arg0 share both register file and register number, a
 * scratch register carrying dst's writemask is obtained from get_tmp()
 * and its contents are copied to dst with a MOV afterwards.
 *
 * NOTE(review): the parameter list and the calls to `func` (aliased and
 * non-aliased paths) are missing from this listing; by analogy with the
 * visible unalias2/unalias3 bodies, `func` presumably targets the
 * scratch register in the aliased case and dst otherwise -- confirm.
 */
243 static void unalias1( struct brw_vs_compile
*c
,
246 void (*func
)( struct brw_vs_compile
*,
250 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
251 struct brw_compile
*p
= &c
->func
;
252 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
254 brw_MOV(p
, dst
, tmp
);
264 * Checks if 2-operand instruction needs an intermediate temporary.
/**
 * Run a two-operand emitter (`func`) safely when the destination may
 * alias either source.
 *
 * If dst has the same register file and number as arg0 or arg1, `func`
 * writes into a scratch register (from get_tmp(), given dst's
 * writemask) and the result is then copied to dst with a MOV.
 * Otherwise `func` is invoked directly on dst.
 *
 * NOTE(review): parts of the parameter list and the else/brace lines
 * are missing from this listing; whether the scratch register is
 * released afterwards cannot be seen here.
 */
266 static void unalias2( struct brw_vs_compile
*c
,
270 void (*func
)( struct brw_vs_compile
*,
275 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
276 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
277 struct brw_compile
*p
= &c
->func
;
278 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
279 func(c
, tmp
, arg0
, arg1
);
280 brw_MOV(p
, dst
, tmp
);
284 func(c
, dst
, arg0
, arg1
);
290 * Checks if 3-operand instruction needs an intermediate temporary.
/**
 * Run a three-operand emitter (`func`) safely when the destination may
 * alias any of the sources.
 *
 * If dst has the same register file and number as arg0, arg1 or arg2,
 * `func` writes into a scratch register (from get_tmp(), given dst's
 * writemask) and the result is then copied to dst with a MOV.
 * Otherwise `func` is invoked directly on dst.
 *
 * NOTE(review): parts of the parameter list and the else/brace lines
 * are missing from this listing; whether the scratch register is
 * released afterwards cannot be seen here.
 */
292 static void unalias3( struct brw_vs_compile
*c
,
297 void (*func
)( struct brw_vs_compile
*,
303 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
304 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
305 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
306 struct brw_compile
*p
= &c
->func
;
307 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
308 func(c
, tmp
, arg0
, arg1
, arg2
);
309 brw_MOV(p
, dst
, tmp
);
313 func(c
, dst
, arg0
, arg1
, arg2
);
/**
 * Shared body of the "set on comparison" opcodes (see emit_seq,
 * emit_sne, emit_slt, emit_sle, emit_sgt, emit_sge).
 *
 * Emits, in order: MOV dst, 0.0; a CMP of arg0 against arg1 with
 * condition `cond` whose destination is the null register; MOV dst, 1.0;
 * and finally brw_set_predicate_control_flag_value(p, 0xff).
 *
 * NOTE(review): presumably the CMP leaves predication enabled so the
 * second MOV only lands in channels where the comparison held, and the
 * last call restores the default flag state -- confirm against the
 * emit helpers.  Most of the parameter list is missing from this
 * listing.
 */
317 static void emit_sop( struct brw_compile
*p
,
323 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
324 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
325 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
326 brw_set_predicate_control_flag_value(p
, 0xff);
/** SEQ: forwards to emit_sop() with BRW_CONDITIONAL_EQ (arg0 == arg1). */
329 static void emit_seq( struct brw_compile
*p
,
332 struct brw_reg arg1
)
334 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
/** SNE: forwards to emit_sop() with BRW_CONDITIONAL_NEQ (arg0 != arg1). */
337 static void emit_sne( struct brw_compile
*p
,
340 struct brw_reg arg1
)
342 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
/** SLT: forwards to emit_sop() with BRW_CONDITIONAL_L (arg0 < arg1). */
344 static void emit_slt( struct brw_compile
*p
,
347 struct brw_reg arg1
)
349 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
/** SLE: forwards to emit_sop() with BRW_CONDITIONAL_LE (arg0 <= arg1). */
352 static void emit_sle( struct brw_compile
*p
,
355 struct brw_reg arg1
)
357 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
/** SGT: forwards to emit_sop() with BRW_CONDITIONAL_G (arg0 > arg1). */
360 static void emit_sgt( struct brw_compile
*p
,
363 struct brw_reg arg1
)
365 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
/** SGE: forwards to emit_sop() with BRW_CONDITIONAL_GE (arg0 >= arg1). */
368 static void emit_sge( struct brw_compile
*p
,
371 struct brw_reg arg1
)
373 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
/**
 * MAX: compares arg0 < arg1 (result to the null register), then issues
 * SEL dst, arg1, arg0 and switches predication back to
 * BRW_PREDICATE_NONE.
 *
 * Note the SEL operands are in the opposite order to emit_min(), which
 * uses the same comparison; presumably SEL takes its first source where
 * the comparison held, giving arg1 when arg0 < arg1 -- confirm against
 * the emit helpers.
 */
376 static void emit_max( struct brw_compile
*p
,
379 struct brw_reg arg1
)
381 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
382 brw_SEL(p
, dst
, arg1
, arg0
);
383 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/**
 * MIN: compares arg0 < arg1 (result to the null register), then issues
 * SEL dst, arg0, arg1 and switches predication back to
 * BRW_PREDICATE_NONE.
 *
 * Same comparison as emit_max() with the SEL operands swapped;
 * presumably this yields arg0 when arg0 < arg1 -- confirm against the
 * emit helpers.
 */
386 static void emit_min( struct brw_compile
*p
,
389 struct brw_reg arg1
)
391 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
392 brw_SEL(p
, dst
, arg0
, arg1
);
393 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/**
 * Emit a one-operand math-unit operation.
 *
 * `tmp` starts out as dst.  need_tmp is computed as true when dst's
 * writemask is not 0xf or dst is not in the general register file; the
 * in-code comment explains this as a workaround for problems with
 * writemasked math (SEND) results.  The math message is issued with
 * BRW_MATH_SATURATE_NONE and BRW_MATH_DATA_SCALAR, and a MOV from tmp
 * to dst is visible at the end.
 *
 * NOTE(review): the parameter list, the code that swaps in a scratch
 * register when need_tmp is set, the math call itself and the condition
 * around the final MOV are missing from this listing.
 */
397 static void emit_math1( struct brw_vs_compile
*c
,
403 /* There are various odd behaviours with SEND on the simulator. In
404 * addition there are documented issues with the fact that the GEN4
405 * processor doesn't do dependency control properly on SEND
406 * results. So, on balance, this kludge to get around failures
407 * with writemasked math results looks like it might be necessary
408 * whether that turns out to be a simulator bug or not:
410 struct brw_compile
*p
= &c
->func
;
411 struct brw_reg tmp
= dst
;
412 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
413 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
421 BRW_MATH_SATURATE_NONE
,
424 BRW_MATH_DATA_SCALAR
,
428 brw_MOV(p
, dst
, tmp
);
/**
 * Emit a two-operand math-unit operation (used for POW in this file).
 *
 * Mirrors emit_math1(): `tmp` starts as dst and need_tmp is true when
 * dst's writemask is not 0xf or dst is not in the general register
 * file.  The second operand is first copied into message register 3;
 * the math message uses BRW_MATH_SATURATE_NONE and
 * BRW_MATH_DATA_SCALAR, and a MOV from tmp to dst is visible at the
 * end.
 *
 * NOTE(review): the parameter list, the scratch-register setup, the
 * math call itself and the condition around the final MOV are missing
 * from this listing.
 */
434 static void emit_math2( struct brw_vs_compile
*c
,
441 struct brw_compile
*p
= &c
->func
;
442 struct brw_reg tmp
= dst
;
443 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
444 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
449 brw_MOV(p
, brw_message_reg(3), arg1
);
454 BRW_MATH_SATURATE_NONE
,
457 BRW_MATH_DATA_SCALAR
,
461 brw_MOV(p
, dst
, tmp
);
/**
 * EXP opcode, for the case where dst does not alias arg0 (callers go
 * through unalias1).  Each result component is produced only if enabled
 * in dst's writemask:
 *
 *  - X: 2^floor(arg0.x), built by integer manipulation: RNDD of arg0.x
 *       into an integer-typed scratch register, add the bias 127, then
 *       shift left by 23 into dst.x viewed as an integer, placing the
 *       value in the float exponent field.
 *  - Y: FRC of arg0.x, i.e. arg0.x - floor(arg0.x).
 *  - Z: a math-unit BRW_MATH_FUNCTION_EXP of arg0.x at full precision.
 *  - W: the constant 1.
 *
 * NOTE(review): some lines (the math call name, braces, any release of
 * the scratch register) are missing from this listing.
 */
467 static void emit_exp_noalias( struct brw_vs_compile
*c
,
469 struct brw_reg arg0
)
471 struct brw_compile
*p
= &c
->func
;
474 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
475 struct brw_reg tmp
= get_tmp(c
);
476 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
478 /* tmp_d = floor(arg0.x) */
479 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
481 /* result[0] = 2.0 ^ tmp */
483 /* Adjust exponent for floating point:
486 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
488 /* Install exponent and sign.
489 * Excess drops off the edge:
491 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
492 tmp_d
, brw_imm_d(23));
497 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
498 /* result[1] = arg0.x - floor(arg0.x) */
499 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
502 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
503 /* As with the LOG instruction, we might be better off just
504 * doing a taylor expansion here, seeing as we have to do all
507 * If mathbox partial precision is too low, consider also:
508 * result[3] = result[0] * EXP(result[1])
511 BRW_MATH_FUNCTION_EXP
,
512 brw_writemask(dst
, WRITEMASK_Z
),
513 brw_swizzle1(arg0
, 0),
514 BRW_MATH_PRECISION_FULL
);
517 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
518 /* result[3] = 1.0; */
519 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
/**
 * LOG opcode, for the case where dst does not alias arg0 (callers go
 * through unalias1).
 *
 * Works in `tmp`, which starts as dst; need_tmp is true when dst's
 * writemask is not 0xf or dst is not in the general register file, and
 * a MOV from tmp to dst is visible at the end.  tmp_ud and arg0_ud are
 * unsigned-integer views of tmp and arg0 used to pick the float apart:
 *
 *  - X (computed when X or Z is requested): arg0.x is masked with
 *    (1U<<31)-1 to drop the sign bit; per the in-code formula the
 *    exponent is then shifted down by 23 and un-biased by 127.
 *  - Y (computed when Y or Z is requested): arg0.x is masked with
 *    (1<<23)-1 to keep the mantissa and combined with 127<<23, giving
 *    a float in [1, 2) per the in-code formula.
 *  - Z: math-unit LOG of tmp.y at full precision, then combined with
 *    tmp.x (result[2] = result[0] + LOG2(result[1])).
 *  - W: the constant 1.
 *
 * NOTE(review): the opcode names of most emitted instructions, the
 * scratch-register setup under need_tmp and the condition around the
 * final MOV are missing from this listing; the description of X and Y
 * above leans on the formulas in the in-code comment.
 */
524 static void emit_log_noalias( struct brw_vs_compile
*c
,
526 struct brw_reg arg0
)
528 struct brw_compile
*p
= &c
->func
;
529 struct brw_reg tmp
= dst
;
530 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
531 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
532 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
533 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
537 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
540 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
543 * These almost look likey they could be joined up, but not really
546 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
547 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
549 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
551 brw_writemask(tmp_ud
, WRITEMASK_X
),
552 brw_swizzle1(arg0_ud
, 0),
553 brw_imm_ud((1U<<31)-1));
556 brw_writemask(tmp_ud
, WRITEMASK_X
),
561 brw_writemask(tmp
, WRITEMASK_X
),
562 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
566 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
568 brw_writemask(tmp_ud
, WRITEMASK_Y
),
569 brw_swizzle1(arg0_ud
, 0),
570 brw_imm_ud((1<<23)-1));
573 brw_writemask(tmp_ud
, WRITEMASK_Y
),
575 brw_imm_ud(127<<23));
578 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
579 /* result[2] = result[0] + LOG2(result[1]); */
581 /* Why bother? The above is just a hint how to do this with a
582 * taylor series. Maybe we *should* use a taylor series as by
583 * the time all the above has been done it's almost certainly
584 * quicker than calling the mathbox, even with low precision.
587 * - result[0] + mathbox.LOG2(result[1])
588 * - mathbox.LOG2(arg0.x)
589 * - result[0] + inline_taylor_approx(result[1])
592 BRW_MATH_FUNCTION_LOG
,
593 brw_writemask(tmp
, WRITEMASK_Z
),
594 brw_swizzle1(tmp
, 1),
595 BRW_MATH_PRECISION_FULL
);
598 brw_writemask(tmp
, WRITEMASK_Z
),
599 brw_swizzle1(tmp
, 2),
600 brw_swizzle1(tmp
, 0));
603 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
604 /* result[3] = 1.0; */
605 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
609 brw_MOV(p
, dst
, tmp
);
615 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
/**
 * DST opcode, for the case where dst aliases neither source (callers go
 * through unalias2).  One instruction per enabled writemask channel:
 *
 *  - X: constant 1.0
 *  - Y: arg0 * arg1 (written to the Y channel only)
 *  - Z: arg0 (Z channel only)
 *  - W: arg1 (W channel only)
 *
 * Because the channels are written one at a time from the sources, a
 * destination overlapping a source would be clobbered mid-sequence,
 * hence the no-alias requirement.
 *
 * NOTE(review): part of the parameter list is missing from this
 * listing.
 */
617 static void emit_dst_noalias( struct brw_vs_compile
*c
,
622 struct brw_compile
*p
= &c
->func
;
624 /* There must be a better way to do this:
626 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
627 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
628 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
629 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
630 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
631 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
632 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
633 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
/**
 * XPD (cross product) of t and u in two instructions:
 *
 *   MUL null, t.yzxw, u.zxyw
 *   MAC dst,  -t.zxyw, u.yzxw
 *
 * The MUL's destination is the null register; presumably its product is
 * retained in the accumulator and the MAC adds its own product to it,
 * giving t.yzx*u.zxy - t.zxy*u.yzx -- confirm the accumulator behaviour
 * against the hardware documentation.
 *
 * NOTE(review): part of the parameter list is missing from this
 * listing.
 */
637 static void emit_xpd( struct brw_compile
*p
,
642 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
643 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
/**
 * LIT opcode, for the case where dst does not alias arg0 (callers go
 * through unalias1).
 *
 * Visible sequence:
 *  - dst.yz = 0 and dst.xw = 1 as defaults.
 *  - CMP arg0.x > 0, followed by an IF executed 8-wide (the in-code
 *    comment explains why the compare and IF must be 8-wide).
 *  - Inside the IF: dst.y = arg0.x; CMP arg0.y > 0; MOV of arg0.y into
 *    tmp.z, then predication is reset to BRW_PREDICATE_NONE; a
 *    partial-precision math-unit POW writes dst.z from tmp.z and
 *    arg0.w.
 *  - ENDIF.
 *
 * `tmp` starts as dst; need_tmp is true when dst is not in the general
 * register file.
 *
 * NOTE(review): the scratch-register setup/release under need_tmp and
 * the math call name are missing from this listing; presumably the MOV
 * into tmp.z is predicated on the arg0.y compare -- confirm.
 */
647 static void emit_lit_noalias( struct brw_vs_compile
*c
,
649 struct brw_reg arg0
)
651 struct brw_compile
*p
= &c
->func
;
652 struct brw_instruction
*if_insn
;
653 struct brw_reg tmp
= dst
;
654 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
659 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
660 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
662 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
663 * to get all channels active inside the IF. In the clipping code
664 * we run with NoMask, so it's not an option and we can use
665 * BRW_EXECUTE_1 for all comparisions.
667 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
668 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
670 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
672 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
673 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
674 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
677 BRW_MATH_FUNCTION_POW
,
678 brw_writemask(dst
, WRITEMASK_Z
),
679 brw_swizzle1(tmp
, 2),
680 brw_swizzle1(arg0
, 3),
681 BRW_MATH_PRECISION_PARTIAL
);
684 brw_ENDIF(p
, if_insn
);
/**
 * LRP opcode, for the case where dst aliases none of the sources
 * (callers go through unalias3).  Three instructions:
 *
 *   ADD dst,  -arg0, 1.0        dst = 1 - arg0
 *   MUL null, dst,  arg2        product goes to the null register
 *   MAC dst,  arg0, arg1
 *
 * Presumably the MUL result is retained in the accumulator and the MAC
 * adds arg0*arg1 to it, giving arg0*arg1 + (1 - arg0)*arg2 -- confirm
 * the accumulator behaviour against the hardware documentation.  dst is
 * used as scratch by the first instruction, which is why it must not
 * overlap a source.
 *
 * NOTE(review): part of the parameter list is missing from this
 * listing.
 */
689 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
695 struct brw_compile
*p
= &c
->func
;
697 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
698 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
699 brw_MAC(p
, dst
, arg0
, arg1
);
702 /** 3 or 4-component vector normalization */
/**
 * Vector normalisation (callers in this file pass 3 or 4 as the last
 * argument for the NRM3/NRM4-style opcodes).
 *
 * A scratch register receives dot(arg0, arg0) -- both a DP3 and a DP4
 * form are visible, presumably selected by the component count -- then
 * emit_math1() replaces it with its reciprocal square root at full
 * precision, and dst = arg0 * tmp.
 *
 * NOTE(review): the parameter list, the if/else choosing DP3 vs DP4 and
 * any release of the scratch register are missing from this listing.
 */
703 static void emit_nrm( struct brw_vs_compile
*c
,
708 struct brw_compile
*p
= &c
->func
;
709 struct brw_reg tmp
= get_tmp(c
);
711 /* tmp = dot(arg0, arg0) */
713 brw_DP3(p
, tmp
, arg0
, arg0
);
715 brw_DP4(p
, tmp
, arg0
, arg0
);
717 /* tmp = 1 / sqrt(tmp) */
718 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
720 /* dst = arg0 * tmp */
721 brw_MUL(p
, dst
, arg0
, tmp
);
/**
 * Produce a register holding the constant referenced by source operand
 * `argIndex` of `inst`, reading it from the vertex constant buffer.
 *
 * Each of the three possible source operands has a cache slot in
 * c->current_const[] (argIndex is asserted < 3).  A fetch is issued
 * when the slot's cached index differs from src->Index or when the
 * operand uses relative addressing: the slot index is updated and a
 * read is emitted with the slot register as writeback destination,
 * regs[PROGRAM_ADDRESS][0] as address register, byte offset
 * 16 * src->Index and binding-table index
 * SURF_INDEX_VERT_CONST_BUFFER.
 *
 * A second read into a scratch register (const2_reg) is also visible,
 * using the address register re-strided to its upper half; its upper
 * four components are merged into the constant register and the scratch
 * register is released.  A final re-stride of const_reg replicates the
 * lower four floats into the upper half (XYZWXYZW).
 *
 * NOTE(review): the names of the read/merge calls and the conditions
 * selecting between the merge and replicate paths (presumably keyed on
 * relAddr) are missing from this listing, as is the return statement.
 * The bare printf is presumably inside a disabled debug block --
 * confirm.
 */
727 static struct brw_reg
728 get_constant(struct brw_vs_compile
*c
,
729 const struct prog_instruction
*inst
,
732 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
733 struct brw_compile
*p
= &c
->func
;
734 struct brw_reg const_reg
;
735 struct brw_reg const2_reg
;
736 const GLboolean relAddr
= src
->RelAddr
;
738 assert(argIndex
< 3);
740 if (c
->current_const
[argIndex
].index
!= src
->Index
|| relAddr
) {
741 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
743 c
->current_const
[argIndex
].index
= src
->Index
;
746 printf(" fetch const[%d] for arg %d into reg %d\n",
747 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
749 /* need to fetch the constant now */
751 c
->current_const
[argIndex
].reg
,/* writeback dest */
753 relAddr
, /* relative indexing? */
754 addrReg
, /* address register */
755 16 * src
->Index
, /* byte offset */
756 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
761 const2_reg
= get_tmp(c
);
763 /* use upper half of address reg for second read */
764 addrReg
= stride(addrReg
, 0, 4, 0);
768 const2_reg
, /* writeback dest */
770 relAddr
, /* relative indexing? */
771 addrReg
, /* address register */
772 16 * src
->Index
, /* byte offset */
773 SURF_INDEX_VERT_CONST_BUFFER
778 const_reg
= c
->current_const
[argIndex
].reg
;
781 /* merge the two Owords into the constant register */
782 /* const_reg[7..4] = const2_reg[7..4] */
784 suboffset(stride(const_reg
, 0, 4, 1), 4),
785 suboffset(stride(const2_reg
, 0, 4, 1), 4));
786 release_tmp(c
, const2_reg
);
789 /* replicate lower four floats into upper half (to get XYZWXYZW) */
790 const_reg
= stride(const_reg
, 0, 4, 0);
799 /* TODO: relative addressing!
/**
 * Look up the hardware register statically assigned to program register
 * (file, index).
 *
 * Visible cases:
 *  - PROGRAM_TEMPORARY (and presumably other cases sharing that arm,
 *    not visible here): returns c->regs[file][index] after asserting
 *    its register number is non-zero.
 *  - PROGRAM_STATE_VAR / PROGRAM_CONSTANT / PROGRAM_UNIFORM: all map to
 *    c->regs[PROGRAM_STATE_VAR][index], with the same assertion.
 *  - PROGRAM_ADDRESS: returns c->regs[file][index].
 *  - PROGRAM_UNDEFINED: returns the null register.
 *  - PROGRAM_LOCAL_PARAM / PROGRAM_ENV_PARAM / PROGRAM_WRITE_ONLY:
 *    return the null register.
 *
 * No relative addressing is handled here (see the TODO before this
 * function).
 *
 * NOTE(review): the rest of the parameter list, the switch header, any
 * further case labels and any assertions in the fall-back arm are
 * missing from this listing.
 */
801 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
802 gl_register_file file
,
806 case PROGRAM_TEMPORARY
:
809 assert(c
->regs
[file
][index
].nr
!= 0);
810 return c
->regs
[file
][index
];
811 case PROGRAM_STATE_VAR
:
812 case PROGRAM_CONSTANT
:
813 case PROGRAM_UNIFORM
:
814 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
815 return c
->regs
[PROGRAM_STATE_VAR
][index
];
816 case PROGRAM_ADDRESS
:
818 return c
->regs
[file
][index
];
820 case PROGRAM_UNDEFINED
: /* undef values */
821 return brw_null_reg();
823 case PROGRAM_LOCAL_PARAM
:
824 case PROGRAM_ENV_PARAM
:
825 case PROGRAM_WRITE_ONLY
:
828 return brw_null_reg();
834 * Indirect addressing: get reg[[arg] + offset].
/**
 * Indirect register read: load reg[[address] + offset] into a scratch
 * register.
 *
 * byte_offset is computed as arg.nr * 32 + arg.subnr + offset * 16.
 * Under a pushed instruction state with BRW_ALIGN_1 access mode, the
 * hardware address register 0 is loaded twice -- first from the
 * word-typed first element of regs[PROGRAM_ADDRESS][0], then from the
 * same view at suboffset 8 -- each time adding byte_offset, and each
 * time a 4-dword indirect read is MOVed into one half of `tmp` (tmp,
 * then tmp at suboffset 4).  The instruction state is popped
 * afterwards.
 *
 * As the in-code note says, the scratch register is NOT released here.
 *
 * NOTE(review): the parameter list and the return statement are missing
 * from this listing; from the call sites the arguments are the base
 * register and an element offset.
 */
836 static struct brw_reg
deref( struct brw_vs_compile
*c
,
840 struct brw_compile
*p
= &c
->func
;
841 struct brw_reg tmp
= vec4(get_tmp(c
));
842 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
843 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_UW
);
844 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
845 struct brw_reg indirect
= brw_vec4_indirect(0,0);
848 brw_push_insn_state(p
);
849 brw_set_access_mode(p
, BRW_ALIGN_1
);
851 /* This is pretty clunky - load the address register twice and
852 * fetch each 4-dword value in turn. There must be a way to do
853 * this in a single pass, but I couldn't get it to work.
855 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
856 brw_MOV(p
, tmp
, indirect
);
858 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
859 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
861 brw_pop_insn_state(p
);
864 /* NOTE: tmp not released */
870 * Get brw reg corresponding to the instruction's [argIndex] src reg.
871 * TODO: relative addressing!
/**
 * Resolve source operand `argIndex` of `inst` to a hardware register,
 * before swizzle/negate are applied (see get_arg()).
 *
 * Visible cases, keyed on the operand's register file:
 *  - PROGRAM_TEMPORARY (and presumably other cases sharing that arm):
 *    either deref() relative to c->regs[file][0] with `index` as
 *    offset, or the statically assigned c->regs[file][index] (asserted
 *    non-zero register number).
 *  - PROGRAM_STATE_VAR / PROGRAM_CONSTANT / PROGRAM_UNIFORM: when the
 *    program uses a constant buffer, get_constant(); otherwise deref()
 *    relative to c->regs[PROGRAM_STATE_VAR][0] or the static
 *    c->regs[PROGRAM_STATE_VAR][index].
 *  - PROGRAM_ADDRESS: c->regs[file][index].
 *  - PROGRAM_UNDEFINED: the null register (normal, since callers loop
 *    over all three source slots).
 *  - PROGRAM_LOCAL_PARAM / PROGRAM_ENV_PARAM / PROGRAM_WRITE_ONLY: the
 *    null register.
 *
 * NOTE(review): the conditions choosing deref() over the static lookup
 * are missing from this listing (presumably `relAddr`), as are the
 * switch header and some case labels.
 */
873 static struct brw_reg
874 get_src_reg( struct brw_vs_compile
*c
,
875 const struct prog_instruction
*inst
,
878 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
879 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
880 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
883 case PROGRAM_TEMPORARY
:
887 return deref(c
, c
->regs
[file
][0], index
);
890 assert(c
->regs
[file
][index
].nr
!= 0);
891 return c
->regs
[file
][index
];
894 case PROGRAM_STATE_VAR
:
895 case PROGRAM_CONSTANT
:
896 case PROGRAM_UNIFORM
:
897 if (c
->vp
->use_const_buffer
) {
898 return get_constant(c
, inst
, argIndex
);
901 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
);
904 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
905 return c
->regs
[PROGRAM_STATE_VAR
][index
];
907 case PROGRAM_ADDRESS
:
909 return c
->regs
[file
][index
];
911 case PROGRAM_UNDEFINED
:
912 /* this is a normal case since we loop over all three src args */
913 return brw_null_reg();
915 case PROGRAM_LOCAL_PARAM
:
916 case PROGRAM_ENV_PARAM
:
917 case PROGRAM_WRITE_ONLY
:
920 return brw_null_reg();
/**
 * ARL (address register load).
 *
 * Rounds arg0 down with RNDD into `tmp`, then stores tmp * 16 in dst.
 * The factor 16 matches the 16-byte element stride used by deref(),
 * which adds `offset * 16` to the same address value.
 *
 * `tmp` starts as dst; need_tmp is true when dst is not in the general
 * register file.
 *
 * NOTE(review): part of the parameter list and the scratch-register
 * setup/release under need_tmp are missing from this listing.
 */
925 static void emit_arl( struct brw_vs_compile
*c
,
927 struct brw_reg arg0
)
929 struct brw_compile
*p
= &c
->func
;
930 struct brw_reg tmp
= dst
;
931 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
936 brw_RNDD(p
, tmp
, arg0
); /* tmp = round(arg0) */
937 brw_MUL(p
, dst
, tmp
, brw_imm_d(16)); /* dst = tmp * 16 */
945 * Return the brw reg for the given instruction's src argument.
946 * Will return mangled results for SWZ op. The emit_swz() function
947 * ignores this result and recalculates taking extended swizzles into
/**
 * Build the complete hardware operand for source `argIndex` of `inst`.
 *
 * Undefined operands yield the null register.  Otherwise the base
 * register comes from get_src_reg(), its swizzle field is filled from
 * the four selectors of src->Swizzle via BRW_SWIZZLE4, and its negate
 * flag is set to 1 when src->Negate is non-zero.
 *
 * As the comment before this function notes, the result is not
 * meaningful for the SWZ opcode, whose extended swizzles are handled by
 * emit_swz().
 *
 * NOTE(review): the last parameter, the declaration of `reg` and the
 * return statement are missing from this listing.
 */
950 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
951 const struct prog_instruction
*inst
,
954 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
957 if (src
->File
== PROGRAM_UNDEFINED
)
958 return brw_null_reg();
960 reg
= get_src_reg(c
, inst
, argIndex
);
962 /* Convert 3-bit swizzle to 2-bit.
964 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
965 GET_SWZ(src
->Swizzle
, 1),
966 GET_SWZ(src
->Swizzle
, 2),
967 GET_SWZ(src
->Swizzle
, 3));
969 /* Note this is ok for non-swizzle instructions:
971 reg
.negate
= src
->Negate
? 1 : 0;
978 * Get brw register for the given program dest register.
/**
 * Resolve a program destination register to a hardware register and
 * attach its writemask.
 *
 * Visible cases:
 *  - PROGRAM_TEMPORARY (and presumably other cases sharing that arm):
 *    c->regs[dst.File][dst.Index], asserted to have a non-zero register
 *    number.
 *  - PROGRAM_ADDRESS: c->regs[dst.File][dst.Index]; only index 0 is
 *    allowed (asserted).
 *  - PROGRAM_UNDEFINED: the null register (per the in-code comment this
 *    happens for opcodes such as END or KIL).
 *  - A further arm, presumably the default, also yields the null
 *    register.
 *
 * Finally reg.dw1.bits.writemask is set from dst.WriteMask.
 *
 * NOTE(review): the switch header, some case labels, break statements
 * and the return statement are missing from this listing.
 */
980 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
981 struct prog_dst_register dst
)
986 case PROGRAM_TEMPORARY
:
988 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
989 reg
= c
->regs
[dst
.File
][dst
.Index
];
991 case PROGRAM_ADDRESS
:
992 assert(dst
.Index
== 0);
993 reg
= c
->regs
[dst
.File
][dst
.Index
];
995 case PROGRAM_UNDEFINED
:
996 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
997 reg
= brw_null_reg();
1001 reg
= brw_null_reg();
1004 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
/**
 * SWZ opcode: extended swizzle of source operand 0, where each
 * destination channel may come from a source component or be the
 * constant 0 or 1, with per-channel negation.
 *
 * For every channel enabled in dst's writemask the selector
 * GET_SWZ(src.Swizzle, i) is examined and the channel is sorted into
 * one of three masks (src_mask, zeros_mask, ones_mask).  The emitted
 * code then:
 *  - MOVs the swizzled source (from get_src_reg(), so without the
 *    ordinary swizzle/negate of get_arg()) into the src_mask channels
 *    first, in case dst aliases the source;
 *  - MOVs 0 into the zeros_mask channels and 1 into the ones_mask
 *    channels;
 *  - negates in place the channels named by src.Negate;
 *  - copies tmp to dst and releases tmp.
 *
 * `tmp` starts as dst; need_tmp is true only when negation is requested
 * and dst is not in the general register file.
 *
 * NOTE(review): the classification switch inside the loop, the
 * conditions guarding each MOV, and the scratch-register setup under
 * need_tmp are missing from this listing; the final MOV/release are
 * presumably conditional on need_tmp -- confirm.
 */
1010 static void emit_swz( struct brw_vs_compile
*c
,
1012 const struct prog_instruction
*inst
)
1014 const GLuint argIndex
= 0;
1015 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1016 struct brw_compile
*p
= &c
->func
;
1017 GLuint zeros_mask
= 0;
1018 GLuint ones_mask
= 0;
1019 GLuint src_mask
= 0;
1021 GLboolean need_tmp
= (src
.Negate
&&
1022 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1023 struct brw_reg tmp
= dst
;
1029 for (i
= 0; i
< 4; i
++) {
1030 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1031 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1050 /* Do src first, in case dst aliases src:
1053 struct brw_reg arg0
;
1055 arg0
= get_src_reg(c
, inst
, argIndex
);
1057 arg0
= brw_swizzle(arg0
,
1058 src_swz
[0], src_swz
[1],
1059 src_swz
[2], src_swz
[3]);
1061 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1065 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1068 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1071 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1074 brw_MOV(p
, dst
, tmp
);
1075 release_tmp(c
, tmp
);
1081 * Post-vertex-program processing. Send the results to the URB.
/**
 * Epilogue of the vertex program: assemble the vertex header in message
 * registers and send the results to the URB.
 *
 * Visible steps:
 *  - If key.copy_edgeflag is set, an instruction is emitted whose
 *    operands are the VERT_RESULT_EDGE output and VERT_ATTRIB_EDGEFLAG
 *    input registers (presumably a MOV copying the flag -- the opcode
 *    line is not visible).
 *  - NDC coordinates: ndc = 1 / pos.w via emit_math1(INV), then
 *    ndc.xyz = pos * ndc.
 *  - Header dword (built in a scratch register `header1`, zeroed first)
 *    when point size is written, user clip planes are enabled, or
 *    BRW_IS_965: point size is scaled by 1<<11 and masked with
 *    0x7ff<<8 into the W channel; for each user clip plane a DP4 of
 *    pos against the plane with conditional-mod L is followed by an OR
 *    of bit i; on 965 an additional test on ndc.w sets bit 6 and zeroes
 *    ndc (clipping workaround described in the in-code comment).  The
 *    header is then MOVed to message register 1 in ALIGN_1 mode and the
 *    scratch register released.  Otherwise message register 1 is
 *    zeroed.
 *  - ndc goes to m2.  On IGDNG pos goes to both m3 and m7 and the
 *    header length becomes 6; otherwise pos goes to m3 and the header
 *    length is 2.
 *  - A URB write starting at MRF 0 with message length
 *    MIN2(nr_outputs + 1 + header length, BRW_MAX_MRF - 1); `eot` is
 *    true when no outputs overflowed (first_overflow_output == 0).
 *  - If outputs overflowed, each written output from
 *    first_overflow_output onwards is MOVed from its GRF into message
 *    registers starting at m4 and a second URB write is issued with
 *    destination offset BRW_MAX_MRF - 1.
 *
 * NOTE(review): many lines are missing from this listing, including the
 * declarations of ndc/eot/i/mrf, the opcode names of several
 * instructions, the increments of `mrf` and most arguments of the URB
 * writes; treat the summary above as describing only what is visible.
 */
1083 static void emit_vertex_write( struct brw_vs_compile
*c
)
1085 struct brw_compile
*p
= &c
->func
;
1086 struct brw_reg m0
= brw_message_reg(0);
1087 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1090 GLuint len_vertext_header
= 2;
1092 if (c
->key
.copy_edgeflag
) {
1094 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1095 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1098 /* Build ndc coords */
1100 /* ndc = 1.0 / pos.w */
1101 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1102 /* ndc.xyz = pos * ndc */
1103 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1105 /* Update the header for point size, user clipping flags, and -ve rhw
1108 if ((c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) ||
1109 c
->key
.nr_userclip
|| BRW_IS_965(p
->brw
))
1111 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1114 brw_MOV(p
, header1
, brw_imm_ud(0));
1116 brw_set_access_mode(p
, BRW_ALIGN_16
);
1118 if (c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) {
1119 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1120 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1121 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1124 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1125 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1126 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1127 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1128 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1131 /* i965 clipping workaround:
1132 * 1) Test for -ve rhw
1134 * set ndc = (0,0,0,0)
1137 * Later, clipping will detect ucp[6] and ensure the primitive is
1138 * clipped against all fixed planes.
1140 if (BRW_IS_965(p
->brw
)) {
1142 vec8(brw_null_reg()),
1144 brw_swizzle1(ndc
, 3),
1147 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1148 brw_MOV(p
, ndc
, brw_imm_f(0));
1149 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1152 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1153 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1154 brw_set_access_mode(p
, BRW_ALIGN_16
);
1156 release_tmp(c
, header1
);
1159 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1162 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1163 * of zeros followed by two sets of NDC coordinates:
1165 brw_set_access_mode(p
, BRW_ALIGN_1
);
1166 brw_MOV(p
, offset(m0
, 2), ndc
);
1168 if (BRW_IS_IGDNG(p
->brw
)) {
1169 /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1170 brw_MOV(p
, offset(m0
, 3), pos
); /* a portion of vertex header */
1171 /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1172 * Seems it is useless for us.
1173 * m6 is used for aligning, so that the remainder of vertex element is
1176 brw_MOV(p
, offset(m0
, 7), pos
); /* the remainder of vertex element */
1177 len_vertext_header
= 6;
1179 brw_MOV(p
, offset(m0
, 3), pos
);
1180 len_vertext_header
= 2;
1183 eot
= (c
->first_overflow_output
== 0);
1186 brw_null_reg(), /* dest */
1187 0, /* starting mrf reg nr */
1191 MIN2(c
->nr_outputs
+ 1 + len_vertext_header
, (BRW_MAX_MRF
-1)), /* msg len */
1192 0, /* response len */
1194 1, /* writes complete */
1195 0, /* urb destination offset */
1196 BRW_URB_SWIZZLE_INTERLEAVE
);
1198 if (c
->first_overflow_output
> 0) {
1199 /* Not all of the vertex outputs/results fit into the MRF.
1200 * Move the overflowed attributes from the GRF to the MRF and
1201 * issue another brw_urb_WRITE().
1203 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1207 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1208 if (c
->prog_data
.outputs_written
& (1 << i
)) {
1209 /* move from GRF to MRF */
1210 brw_MOV(p
, brw_message_reg(4+mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1216 brw_null_reg(), /* dest */
1217 4, /* starting mrf reg nr */
1221 mrf
+1, /* msg len */
1222 0, /* response len */
1224 1, /* writes complete */
1225 BRW_MAX_MRF
-1, /* urb destination offset */
1226 BRW_URB_SWIZZLE_INTERLEAVE
);
1232 * Called after code generation to resolve subroutine calls and the
1234 * \param end_inst points to brw code for END instruction
1235 * \param last_inst points to last instruction emitted before vertex write
/**
 * Post-pass run after all instructions have been emitted.
 *
 * Resolves subroutine calls via brw_resolve_cals(), then patches the
 * END instruction so that it jumps forward past any code emitted after
 * it: the distance from end_inst to last_inst, in instructions, is
 * multiplied by 16 and written as src1 of end_inst.
 *
 * NOTE(review): the return type and the declaration of `offset` are
 * missing from this listing; the factor 16 is presumably the size in
 * bytes of one instruction -- confirm.
 */
1238 post_vs_emit( struct brw_vs_compile
*c
,
1239 struct brw_instruction
*end_inst
,
1240 struct brw_instruction
*last_inst
)
1244 brw_resolve_cals(&c
->func
);
1246 /* patch up the END code to jump past subroutines, etc */
1247 offset
= last_inst
- end_inst
;
1248 brw_set_src1(end_inst
, brw_imm_d(offset
* 16));
1252 /* Emit the vertex program instructions here.
1254 void brw_vs_emit(struct brw_vs_compile
*c
)
1256 #define MAX_IF_DEPTH 32
1257 #define MAX_LOOP_DEPTH 32
1258 struct brw_compile
*p
= &c
->func
;
1259 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1260 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1261 GLuint end_offset
= 0;
1262 struct brw_instruction
*end_inst
, *last_inst
;
1263 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
];
1264 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1268 if (INTEL_DEBUG
& DEBUG_VS
) {
1269 _mesa_printf("vs-emit:\n");
1270 _mesa_print_program(&c
->vp
->program
.Base
);
1274 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1275 brw_set_access_mode(p
, BRW_ALIGN_16
);
1277 /* Message registers can't be read, so copy the output into GRF register
1278 if they are used in source registers */
1279 for (insn
= 0; insn
< nr_insns
; insn
++) {
1281 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1282 for (i
= 0; i
< 3; i
++) {
1283 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1284 GLuint index
= src
->Index
;
1285 GLuint file
= src
->File
;
1286 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1287 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1291 /* Static register allocation
1293 brw_vs_alloc_regs(c
);
1294 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1296 for (insn
= 0; insn
< nr_insns
; insn
++) {
1298 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1299 struct brw_reg args
[3], dst
;
1303 printf("%d: ", insn
);
1304 _mesa_print_instruction(inst
);
1307 /* Get argument regs. SWZ is special and does this itself.
1309 if (inst
->Opcode
!= OPCODE_SWZ
)
1310 for (i
= 0; i
< 3; i
++) {
1311 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1314 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1315 args
[i
] = c
->output_regs
[index
].reg
;
1317 args
[i
] = get_arg(c
, inst
, i
);
1320 /* Get dest regs. Note that it is possible for a reg to be both
1321 * dst and arg, given the static allocation of registers. So
1322 * care needs to be taken emitting multi-operation instructions.
1324 index
= inst
->DstReg
.Index
;
1325 file
= inst
->DstReg
.File
;
1326 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1327 dst
= c
->output_regs
[index
].reg
;
1329 dst
= get_dst(c
, inst
->DstReg
);
1331 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1332 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1333 inst
->SaturateMode
);
1336 switch (inst
->Opcode
) {
1338 brw_MOV(p
, dst
, brw_abs(args
[0]));
1341 brw_ADD(p
, dst
, args
[0], args
[1]);
1344 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1347 brw_DP3(p
, dst
, args
[0], args
[1]);
1350 brw_DP4(p
, dst
, args
[0], args
[1]);
1353 brw_DPH(p
, dst
, args
[0], args
[1]);
1356 emit_nrm(c
, dst
, args
[0], 3);
1359 emit_nrm(c
, dst
, args
[0], 4);
1362 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1365 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1368 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1371 emit_arl(c
, dst
, args
[0]);
1374 brw_RNDD(p
, dst
, args
[0]);
1377 brw_FRC(p
, dst
, args
[0]);
1380 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1383 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1386 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1389 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1392 brw_MOV(p
, brw_acc_reg(), args
[2]);
1393 brw_MAC(p
, dst
, args
[0], args
[1]);
1396 emit_max(p
, dst
, args
[0], args
[1]);
1399 emit_min(p
, dst
, args
[0], args
[1]);
1402 brw_MOV(p
, dst
, args
[0]);
1405 brw_MUL(p
, dst
, args
[0], args
[1]);
1408 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1411 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1414 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1418 emit_seq(p
, dst
, args
[0], args
[1]);
1421 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1424 emit_sne(p
, dst
, args
[0], args
[1]);
1427 emit_sge(p
, dst
, args
[0], args
[1]);
1430 emit_sgt(p
, dst
, args
[0], args
[1]);
1433 emit_slt(p
, dst
, args
[0], args
[1]);
1436 emit_sle(p
, dst
, args
[0], args
[1]);
1439 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1442 /* The args[0] value can't be used here as it won't have
1443 * correctly encoded the full swizzle:
1445 emit_swz(c
, dst
, inst
);
1448 /* round toward zero */
1449 brw_RNDZ(p
, dst
, args
[0]);
1452 emit_xpd(p
, dst
, args
[0], args
[1]);
1455 assert(if_depth
< MAX_IF_DEPTH
);
1456 if_inst
[if_depth
++] = brw_IF(p
, BRW_EXECUTE_8
);
1459 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1462 assert(if_depth
> 0);
1463 brw_ENDIF(p
, if_inst
[--if_depth
]);
1466 case OPCODE_BGNLOOP
:
1467 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1471 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1475 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1477 case OPCODE_ENDLOOP
:
1479 struct brw_instruction
*inst0
, *inst1
;
1481 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1482 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1483 while (inst0
> loop_inst
[loop_depth
]) {
1485 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
) {
1486 inst0
->bits3
.if_else
.jump_count
= inst1
- inst0
+ 1;
1487 inst0
->bits3
.if_else
.pop_count
= 0;
1489 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
) {
1490 inst0
->bits3
.if_else
.jump_count
= inst1
- inst0
;
1491 inst0
->bits3
.if_else
.pop_count
= 0;
1501 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
1502 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1503 brw_set_predicate_control_flag_value(p
, 0xff);
1506 brw_set_access_mode(p
, BRW_ALIGN_1
);
1507 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1508 brw_set_access_mode(p
, BRW_ALIGN_16
);
1509 brw_ADD(p
, get_addr_reg(stack_index
),
1510 get_addr_reg(stack_index
), brw_imm_d(4));
1511 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1512 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1515 brw_ADD(p
, get_addr_reg(stack_index
),
1516 get_addr_reg(stack_index
), brw_imm_d(-4));
1517 brw_set_access_mode(p
, BRW_ALIGN_1
);
1518 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1519 brw_set_access_mode(p
, BRW_ALIGN_16
);
1522 end_offset
= p
->nr_insn
;
1523 /* this instruction will get patched later to jump past subroutine
1526 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1532 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1538 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1539 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1540 _mesa_opcode_string(inst
->Opcode
) :
1544 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1545 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1546 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1547 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1550 /* Result color clamping.
1552 * When destination register is an output register and
1553 * it's primary/secondary front/back color, we have to clamp
1554 * the result to [0,1]. This is done by enabling the
1555 * saturation bit for the last instruction.
1557 * We don't use brw_set_saturate() as it modifies
1558 * p->current->header.saturate, which affects all the subsequent
1559 * instructions. Instead, we directly modify the header
1560 * of the last (already stored) instruction.
1562 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1563 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1564 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1565 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1566 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1567 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1574 end_inst
= &p
->store
[end_offset
];
1575 last_inst
= &p
->store
[p
->nr_insn
];
1577 /* The END instruction will be patched to jump to this code */
1578 emit_vertex_write(c
);
1580 post_vs_emit(c
, end_inst
, last_inst
);