2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
41 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
43 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
45 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
46 c
->prog_data
.total_grf
= c
->last_tmp
;
51 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
53 if (tmp
.nr
== c
->last_tmp
-1)
57 static void release_tmps( struct brw_vs_compile
*c
)
59 c
->last_tmp
= c
->first_tmp
;
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
68 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
70 GLuint i
, reg
= 0, mrf
;
71 int attributes_in_vue
;
74 if (c
->vp
->program
.Base
.Parameters
->NumParameters
>= 6)
75 c
->vp
->use_const_buffer
= 1;
78 c
->vp
->use_const_buffer
= GL_FALSE
;
79 /*printf("use_const_buffer = %d\n", c->use_const_buffer);*/
81 /* r0 -- reserved as usual
83 c
->r0
= brw_vec8_grf(reg
, 0);
86 /* User clip planes from curbe:
88 if (c
->key
.nr_userclip
) {
89 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
90 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
93 /* Deal with curbe alignment:
95 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
98 /* Vertex program parameters from curbe:
100 if (c
->vp
->use_const_buffer
) {
101 /* get constants from a real constant buffer */
102 c
->prog_data
.curb_read_length
= 0;
103 c
->prog_data
.nr_params
= 4; /* XXX 0 causes a bug elsewhere... */
106 /* use a section of the GRF for constants */
107 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
108 for (i
= 0; i
< nr_params
; i
++) {
109 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
111 reg
+= (nr_params
+ 1) / 2;
112 c
->prog_data
.curb_read_length
= reg
- 1;
114 c
->prog_data
.nr_params
= nr_params
* 4;
117 /* Allocate input regs:
120 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
121 if (c
->prog_data
.inputs_read
& (1 << i
)) {
123 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
127 /* If there are no inputs, we'll still be reading one attribute's worth
128 * because it's required -- see urb_read_length setting.
130 if (c
->nr_inputs
== 0)
133 /* Allocate outputs: TODO: could organize the non-position outputs
134 * to go straight into message regs.
137 c
->first_output
= reg
;
139 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
140 if (c
->prog_data
.outputs_written
& (1 << i
)) {
142 if (i
== VERT_RESULT_HPOS
) {
143 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
146 else if (i
== VERT_RESULT_PSIZ
) {
147 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
149 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
152 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
158 /* Allocate program temporaries:
160 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
161 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
165 /* Address reg(s). Don't try to use the internal address reg until
168 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
169 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
173 BRW_VERTICAL_STRIDE_8
,
175 BRW_HORIZONTAL_STRIDE_1
,
181 if (c
->vp
->use_const_buffer
) {
182 for (i
= 0; i
< 3; i
++) {
183 c
->current_const
[i
].index
= -1;
184 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
189 for (i
= 0; i
< 128; i
++) {
190 if (c
->output_regs
[i
].used_in_src
) {
191 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
196 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
199 /* Some opcodes need an internal temporary:
202 c
->last_tmp
= reg
; /* for allocation purposes */
204 /* Each input reg holds data from two vertices. The
205 * urb_read_length is the number of registers read from *each*
206 * vertex urb, so is half the amount:
208 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
209 /* Setting this field to 0 leads to undefined behavior according to the
210 * VS_STATE docs. Our VUEs will always have at least one attribute
211 * sitting in them, even if it's padding.
213 if (c
->prog_data
.urb_read_length
== 0)
214 c
->prog_data
.urb_read_length
= 1;
216 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
217 * them to fit the biggest thing they need to.
219 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
221 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
223 c
->prog_data
.total_grf
= reg
;
225 if (INTEL_DEBUG
& DEBUG_VS
) {
226 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
227 _mesa_printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
228 _mesa_printf("%s reg = %d\n", __FUNCTION__
, reg
);
234 * If an instruction uses a temp reg both as a src and the dest, we
235 * sometimes need to allocate an intermediate temporary.
237 static void unalias1( struct brw_vs_compile
*c
,
240 void (*func
)( struct brw_vs_compile
*,
244 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
245 struct brw_compile
*p
= &c
->func
;
246 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
248 brw_MOV(p
, dst
, tmp
);
258 * Checks if a 2-operand instruction needs an intermediate temporary.
260 static void unalias2( struct brw_vs_compile
*c
,
264 void (*func
)( struct brw_vs_compile
*,
269 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
270 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
271 struct brw_compile
*p
= &c
->func
;
272 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
273 func(c
, tmp
, arg0
, arg1
);
274 brw_MOV(p
, dst
, tmp
);
278 func(c
, dst
, arg0
, arg1
);
284 * Checks if a 3-operand instruction needs an intermediate temporary.
286 static void unalias3( struct brw_vs_compile
*c
,
291 void (*func
)( struct brw_vs_compile
*,
297 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
298 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
299 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
300 struct brw_compile
*p
= &c
->func
;
301 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
302 func(c
, tmp
, arg0
, arg1
, arg2
);
303 brw_MOV(p
, dst
, tmp
);
307 func(c
, dst
, arg0
, arg1
, arg2
);
311 static void emit_sop( struct brw_compile
*p
,
317 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
318 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
319 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
320 brw_set_predicate_control_flag_value(p
, 0xff);
323 static void emit_seq( struct brw_compile
*p
,
326 struct brw_reg arg1
)
328 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
331 static void emit_sne( struct brw_compile
*p
,
334 struct brw_reg arg1
)
336 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
338 static void emit_slt( struct brw_compile
*p
,
341 struct brw_reg arg1
)
343 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
346 static void emit_sle( struct brw_compile
*p
,
349 struct brw_reg arg1
)
351 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
354 static void emit_sgt( struct brw_compile
*p
,
357 struct brw_reg arg1
)
359 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
362 static void emit_sge( struct brw_compile
*p
,
365 struct brw_reg arg1
)
367 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
370 static void emit_max( struct brw_compile
*p
,
373 struct brw_reg arg1
)
375 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
376 brw_SEL(p
, dst
, arg1
, arg0
);
377 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
380 static void emit_min( struct brw_compile
*p
,
383 struct brw_reg arg1
)
385 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
386 brw_SEL(p
, dst
, arg0
, arg1
);
387 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
391 static void emit_math1( struct brw_vs_compile
*c
,
397 /* There are various odd behaviours with SEND on the simulator. In
398 * addition there are documented issues with the fact that the GEN4
399 * processor doesn't do dependency control properly on SEND
400 * results. So, on balance, this kludge to get around failures
401 * with writemasked math results looks like it might be necessary
402 * whether that turns out to be a simulator bug or not:
404 struct brw_compile
*p
= &c
->func
;
405 struct brw_reg tmp
= dst
;
406 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
407 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
415 BRW_MATH_SATURATE_NONE
,
418 BRW_MATH_DATA_SCALAR
,
422 brw_MOV(p
, dst
, tmp
);
428 static void emit_math2( struct brw_vs_compile
*c
,
435 struct brw_compile
*p
= &c
->func
;
436 struct brw_reg tmp
= dst
;
437 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
438 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
443 brw_MOV(p
, brw_message_reg(3), arg1
);
448 BRW_MATH_SATURATE_NONE
,
451 BRW_MATH_DATA_SCALAR
,
455 brw_MOV(p
, dst
, tmp
);
461 static void emit_exp_noalias( struct brw_vs_compile
*c
,
463 struct brw_reg arg0
)
465 struct brw_compile
*p
= &c
->func
;
468 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
469 struct brw_reg tmp
= get_tmp(c
);
470 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
472 /* tmp_d = floor(arg0.x) */
473 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
475 /* result[0] = 2.0 ^ tmp */
477 /* Adjust exponent for floating point:
480 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
482 /* Install exponent and sign.
483 * Excess drops off the edge:
485 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
486 tmp_d
, brw_imm_d(23));
491 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
492 /* result[1] = arg0.x - floor(arg0.x) */
493 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
496 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
497 /* As with the LOG instruction, we might be better off just
498 * doing a taylor expansion here, seeing as we have to do all
501 * If mathbox partial precision is too low, consider also:
502 * result[3] = result[0] * EXP(result[1])
505 BRW_MATH_FUNCTION_EXP
,
506 brw_writemask(dst
, WRITEMASK_Z
),
507 brw_swizzle1(arg0
, 0),
508 BRW_MATH_PRECISION_FULL
);
511 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
512 /* result[3] = 1.0; */
513 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
518 static void emit_log_noalias( struct brw_vs_compile
*c
,
520 struct brw_reg arg0
)
522 struct brw_compile
*p
= &c
->func
;
523 struct brw_reg tmp
= dst
;
524 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
525 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
526 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
527 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
531 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
534 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
537 * These almost look like they could be joined up, but not really
540 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
541 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
543 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
545 brw_writemask(tmp_ud
, WRITEMASK_X
),
546 brw_swizzle1(arg0_ud
, 0),
547 brw_imm_ud((1U<<31)-1));
550 brw_writemask(tmp_ud
, WRITEMASK_X
),
555 brw_writemask(tmp
, WRITEMASK_X
),
556 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
560 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
562 brw_writemask(tmp_ud
, WRITEMASK_Y
),
563 brw_swizzle1(arg0_ud
, 0),
564 brw_imm_ud((1<<23)-1));
567 brw_writemask(tmp_ud
, WRITEMASK_Y
),
569 brw_imm_ud(127<<23));
572 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
573 /* result[2] = result[0] + LOG2(result[1]); */
575 /* Why bother? The above is just a hint how to do this with a
576 * taylor series. Maybe we *should* use a taylor series as by
577 * the time all the above has been done it's almost certainly
578 * quicker than calling the mathbox, even with low precision.
581 * - result[0] + mathbox.LOG2(result[1])
582 * - mathbox.LOG2(arg0.x)
583 * - result[0] + inline_taylor_approx(result[1])
586 BRW_MATH_FUNCTION_LOG
,
587 brw_writemask(tmp
, WRITEMASK_Z
),
588 brw_swizzle1(tmp
, 1),
589 BRW_MATH_PRECISION_FULL
);
592 brw_writemask(tmp
, WRITEMASK_Z
),
593 brw_swizzle1(tmp
, 2),
594 brw_swizzle1(tmp
, 0));
597 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
598 /* result[3] = 1.0; */
599 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
603 brw_MOV(p
, dst
, tmp
);
609 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
611 static void emit_dst_noalias( struct brw_vs_compile
*c
,
616 struct brw_compile
*p
= &c
->func
;
618 /* There must be a better way to do this:
620 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
621 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
622 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
623 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
624 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
625 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
626 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
627 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
631 static void emit_xpd( struct brw_compile
*p
,
636 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
637 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
641 static void emit_lit_noalias( struct brw_vs_compile
*c
,
643 struct brw_reg arg0
)
645 struct brw_compile
*p
= &c
->func
;
646 struct brw_instruction
*if_insn
;
647 struct brw_reg tmp
= dst
;
648 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
653 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
654 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
656 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
657 * to get all channels active inside the IF. In the clipping code
658 * we run with NoMask, so it's not an option and we can use
659 * BRW_EXECUTE_1 for all comparisons.
661 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
662 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
664 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
666 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
667 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
668 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
671 BRW_MATH_FUNCTION_POW
,
672 brw_writemask(dst
, WRITEMASK_Z
),
673 brw_swizzle1(tmp
, 2),
674 brw_swizzle1(arg0
, 3),
675 BRW_MATH_PRECISION_PARTIAL
);
678 brw_ENDIF(p
, if_insn
);
683 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
689 struct brw_compile
*p
= &c
->func
;
691 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
692 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
693 brw_MAC(p
, dst
, arg0
, arg1
);
696 /** 3 or 4-component vector normalization */
697 static void emit_nrm( struct brw_vs_compile
*c
,
702 struct brw_compile
*p
= &c
->func
;
703 struct brw_reg tmp
= get_tmp(c
);
705 /* tmp = dot(arg0, arg0) */
707 brw_DP3(p
, tmp
, arg0
, arg0
);
709 brw_DP4(p
, tmp
, arg0
, arg0
);
711 /* tmp = 1 / sqrt(tmp) */
712 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
714 /* dst = arg0 * tmp */
715 brw_MUL(p
, dst
, arg0
, tmp
);
721 static struct brw_reg
722 get_constant(struct brw_vs_compile
*c
,
723 const struct prog_instruction
*inst
,
726 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
727 struct brw_compile
*p
= &c
->func
;
728 struct brw_reg const_reg
;
729 struct brw_reg const2_reg
;
731 assert(argIndex
< 3);
733 if (c
->current_const
[argIndex
].index
!= src
->Index
|| src
->RelAddr
) {
734 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
736 c
->current_const
[argIndex
].index
= src
->Index
;
739 printf(" fetch const[%d] for arg %d into reg %d\n",
740 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
742 /* need to fetch the constant now */
744 c
->current_const
[argIndex
].reg
,/* writeback dest */
746 src
->RelAddr
, /* relative indexing? */
747 addrReg
, /* address register */
748 16 * src
->Index
, /* byte offset */
749 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
754 const2_reg
= get_tmp(c
);
756 /* use upper half of address reg for second read */
757 addrReg
= stride(addrReg
, 0, 4, 0);
761 const2_reg
, /* writeback dest */
763 src
->RelAddr
, /* relative indexing? */
764 addrReg
, /* address register */
765 16 * src
->Index
, /* byte offset */
766 SURF_INDEX_VERT_CONST_BUFFER
771 const_reg
= c
->current_const
[argIndex
].reg
;
774 /* merge the two Owords into the constant register */
775 /* const_reg[7..4] = const2_reg[7..4] */
777 suboffset(stride(const_reg
, 0, 4, 1), 4),
778 suboffset(stride(const2_reg
, 0, 4, 1), 4));
779 release_tmp(c
, const2_reg
);
782 /* replicate lower four floats into upper half (to get XYZWXYZW) */
783 const_reg
= stride(const_reg
, 0, 4, 0);
792 /* TODO: relative addressing!
794 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
795 gl_register_file file
,
799 case PROGRAM_TEMPORARY
:
802 assert(c
->regs
[file
][index
].nr
!= 0);
803 return c
->regs
[file
][index
];
804 case PROGRAM_STATE_VAR
:
805 case PROGRAM_CONSTANT
:
806 case PROGRAM_UNIFORM
:
807 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
808 return c
->regs
[PROGRAM_STATE_VAR
][index
];
809 case PROGRAM_ADDRESS
:
811 return c
->regs
[file
][index
];
813 case PROGRAM_UNDEFINED
: /* undef values */
814 return brw_null_reg();
816 case PROGRAM_LOCAL_PARAM
:
817 case PROGRAM_ENV_PARAM
:
818 case PROGRAM_WRITE_ONLY
:
821 return brw_null_reg();
827 * Indirect addressing: get reg[[arg] + offset].
829 static struct brw_reg
deref( struct brw_vs_compile
*c
,
833 struct brw_compile
*p
= &c
->func
;
834 struct brw_reg tmp
= vec4(get_tmp(c
));
835 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
836 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_UW
);
837 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
838 struct brw_reg indirect
= brw_vec4_indirect(0,0);
841 brw_push_insn_state(p
);
842 brw_set_access_mode(p
, BRW_ALIGN_1
);
844 /* This is pretty clunky - load the address register twice and
845 * fetch each 4-dword value in turn. There must be a way to do
846 * this in a single pass, but I couldn't get it to work.
848 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
849 brw_MOV(p
, tmp
, indirect
);
851 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
852 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
854 brw_pop_insn_state(p
);
857 /* NOTE: tmp not released */
863 * Get brw reg corresponding to the instruction's [argIndex] src reg.
864 * TODO: relative addressing!
866 static struct brw_reg
867 get_src_reg( struct brw_vs_compile
*c
,
868 const struct prog_instruction
*inst
,
871 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
872 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
873 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
876 case PROGRAM_TEMPORARY
:
880 return deref(c
, c
->regs
[file
][0], index
);
883 assert(c
->regs
[file
][index
].nr
!= 0);
884 return c
->regs
[file
][index
];
887 case PROGRAM_STATE_VAR
:
888 case PROGRAM_CONSTANT
:
889 case PROGRAM_UNIFORM
:
890 if (c
->vp
->use_const_buffer
) {
891 return get_constant(c
, inst
, argIndex
);
894 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
);
897 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
898 return c
->regs
[PROGRAM_STATE_VAR
][index
];
900 case PROGRAM_ADDRESS
:
902 return c
->regs
[file
][index
];
904 case PROGRAM_UNDEFINED
:
905 /* this is a normal case since we loop over all three src args */
906 return brw_null_reg();
908 case PROGRAM_LOCAL_PARAM
:
909 case PROGRAM_ENV_PARAM
:
910 case PROGRAM_WRITE_ONLY
:
913 return brw_null_reg();
918 static void emit_arl( struct brw_vs_compile
*c
,
920 struct brw_reg arg0
)
922 struct brw_compile
*p
= &c
->func
;
923 struct brw_reg tmp
= dst
;
924 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
929 brw_RNDD(p
, tmp
, arg0
); /* tmp = round(arg0) */
930 brw_MUL(p
, dst
, tmp
, brw_imm_d(16)); /* dst = tmp * 16 */
938 * Return the brw reg for the given instruction's src argument.
939 * Will return mangled results for SWZ op. The emit_swz() function
940 * ignores this result and recalculates taking extended swizzles into
943 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
944 const struct prog_instruction
*inst
,
947 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
950 if (src
->File
== PROGRAM_UNDEFINED
)
951 return brw_null_reg();
953 reg
= get_src_reg(c
, inst
, argIndex
);
955 /* Convert 3-bit swizzle to 2-bit.
957 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
958 GET_SWZ(src
->Swizzle
, 1),
959 GET_SWZ(src
->Swizzle
, 2),
960 GET_SWZ(src
->Swizzle
, 3));
962 /* Note this is ok for non-swizzle instructions:
964 reg
.negate
= src
->Negate
? 1 : 0;
971 * Get brw register for the given program dest register.
973 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
974 struct prog_dst_register dst
)
979 case PROGRAM_TEMPORARY
:
981 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
982 reg
= c
->regs
[dst
.File
][dst
.Index
];
984 case PROGRAM_ADDRESS
:
985 assert(dst
.Index
== 0);
986 reg
= c
->regs
[dst
.File
][dst
.Index
];
988 case PROGRAM_UNDEFINED
:
989 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
990 reg
= brw_null_reg();
994 reg
= brw_null_reg();
997 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1003 static void emit_swz( struct brw_vs_compile
*c
,
1005 const struct prog_instruction
*inst
)
1007 const GLuint argIndex
= 0;
1008 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1009 struct brw_compile
*p
= &c
->func
;
1010 GLuint zeros_mask
= 0;
1011 GLuint ones_mask
= 0;
1012 GLuint src_mask
= 0;
1014 GLboolean need_tmp
= (src
.Negate
&&
1015 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1016 struct brw_reg tmp
= dst
;
1022 for (i
= 0; i
< 4; i
++) {
1023 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1024 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1043 /* Do src first, in case dst aliases src:
1046 struct brw_reg arg0
;
1048 arg0
= get_src_reg(c
, inst
, argIndex
);
1050 arg0
= brw_swizzle(arg0
,
1051 src_swz
[0], src_swz
[1],
1052 src_swz
[2], src_swz
[3]);
1054 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1058 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1061 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1064 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1067 brw_MOV(p
, dst
, tmp
);
1068 release_tmp(c
, tmp
);
1074 * Post-vertex-program processing. Send the results to the URB.
1076 static void emit_vertex_write( struct brw_vs_compile
*c
)
1078 struct brw_compile
*p
= &c
->func
;
1079 struct brw_reg m0
= brw_message_reg(0);
1080 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1083 if (c
->key
.copy_edgeflag
) {
1085 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1086 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1089 /* Build ndc coords */
1091 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1092 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1094 /* Update the header for point size, user clipping flags, and -ve rhw
1097 if ((c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) ||
1098 c
->key
.nr_userclip
|| !BRW_IS_G4X(p
->brw
))
1100 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1103 brw_MOV(p
, header1
, brw_imm_ud(0));
1105 brw_set_access_mode(p
, BRW_ALIGN_16
);
1107 if (c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) {
1108 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1109 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1110 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1113 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1114 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1115 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1116 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1117 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1120 /* i965 clipping workaround:
1121 * 1) Test for -ve rhw
1123 * set ndc = (0,0,0,0)
1126 * Later, clipping will detect ucp[6] and ensure the primitive is
1127 * clipped against all fixed planes.
1129 if (!BRW_IS_G4X(p
->brw
)) {
1131 vec8(brw_null_reg()),
1133 brw_swizzle1(ndc
, 3),
1136 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1137 brw_MOV(p
, ndc
, brw_imm_f(0));
1138 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1141 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1142 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1143 brw_set_access_mode(p
, BRW_ALIGN_16
);
1145 release_tmp(c
, header1
);
1148 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1151 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1152 * of zeros followed by two sets of NDC coordinates:
1154 brw_set_access_mode(p
, BRW_ALIGN_1
);
1155 brw_MOV(p
, offset(m0
, 2), ndc
);
1156 brw_MOV(p
, offset(m0
, 3), pos
);
1159 brw_null_reg(), /* dest */
1160 0, /* starting mrf reg nr */
1164 c
->nr_outputs
+ 3, /* msg len */
1165 0, /* response len */
1167 1, /* writes complete */
1168 0, /* urb destination offset */
1169 BRW_URB_SWIZZLE_INTERLEAVE
);
1174 * Called after code generation to resolve subroutine calls and the
1176 * \param end_inst points to brw code for END instruction
1177 * \param last_inst points to last instruction emitted before vertex write
1180 post_vs_emit( struct brw_vs_compile
*c
,
1181 struct brw_instruction
*end_inst
,
1182 struct brw_instruction
*last_inst
)
1186 brw_resolve_cals(&c
->func
);
1188 /* patch up the END code to jump past subroutines, etc */
1189 offset
= last_inst
- end_inst
;
1190 brw_set_src1(end_inst
, brw_imm_d(offset
* 16));
1194 /* Emit the vertex program instructions here.
1196 void brw_vs_emit(struct brw_vs_compile
*c
)
1199 struct brw_compile
*p
= &c
->func
;
1200 GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1201 GLuint insn
, if_insn
= 0;
1202 GLuint end_offset
= 0;
1203 struct brw_instruction
*end_inst
, *last_inst
;
1204 struct brw_instruction
*if_inst
[MAX_IFSN
];
1205 struct brw_indirect stack_index
= brw_indirect(0, 0);
1210 if (INTEL_DEBUG
& DEBUG_VS
) {
1211 _mesa_printf("vs-emit:\n");
1212 _mesa_print_program(&c
->vp
->program
.Base
);
1216 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1217 brw_set_access_mode(p
, BRW_ALIGN_16
);
1219 /* Message registers can't be read, so copy the outputs into GRF registers
1220 if they are used as source registers */
1221 for (insn
= 0; insn
< nr_insns
; insn
++) {
1223 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1224 for (i
= 0; i
< 3; i
++) {
1225 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1226 GLuint index
= src
->Index
;
1227 GLuint file
= src
->File
;
1228 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1229 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1233 /* Static register allocation
1235 brw_vs_alloc_regs(c
);
1236 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1238 for (insn
= 0; insn
< nr_insns
; insn
++) {
1240 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1241 struct brw_reg args
[3], dst
;
1245 printf("%d: ", insn
);
1246 _mesa_print_instruction(inst
);
1249 /* Get argument regs. SWZ is special and does this itself.
1251 if (inst
->Opcode
!= OPCODE_SWZ
)
1252 for (i
= 0; i
< 3; i
++) {
1253 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1256 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1257 args
[i
] = c
->output_regs
[index
].reg
;
1259 args
[i
] = get_arg(c
, inst
, i
);
1262 /* Get dest regs. Note that it is possible for a reg to be both
1263 * dst and arg, given the static allocation of registers. So
1264 * care needs to be taken emitting multi-operation instructions.
1266 index
= inst
->DstReg
.Index
;
1267 file
= inst
->DstReg
.File
;
1268 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1269 dst
= c
->output_regs
[index
].reg
;
1271 dst
= get_dst(c
, inst
->DstReg
);
1273 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1274 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1275 inst
->SaturateMode
);
1278 switch (inst
->Opcode
) {
1280 brw_MOV(p
, dst
, brw_abs(args
[0]));
1283 brw_ADD(p
, dst
, args
[0], args
[1]);
1286 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1289 brw_DP3(p
, dst
, args
[0], args
[1]);
1292 brw_DP4(p
, dst
, args
[0], args
[1]);
1295 brw_DPH(p
, dst
, args
[0], args
[1]);
1298 emit_nrm(c
, dst
, args
[0], 3);
1301 emit_nrm(c
, dst
, args
[0], 4);
1304 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1307 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1310 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1313 emit_arl(c
, dst
, args
[0]);
1316 brw_RNDD(p
, dst
, args
[0]);
1319 brw_FRC(p
, dst
, args
[0]);
1322 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1325 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1328 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1331 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1334 brw_MOV(p
, brw_acc_reg(), args
[2]);
1335 brw_MAC(p
, dst
, args
[0], args
[1]);
1338 emit_max(p
, dst
, args
[0], args
[1]);
1341 emit_min(p
, dst
, args
[0], args
[1]);
1344 brw_MOV(p
, dst
, args
[0]);
1347 brw_MUL(p
, dst
, args
[0], args
[1]);
1350 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1353 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1356 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1360 emit_seq(p
, dst
, args
[0], args
[1]);
1363 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1366 emit_sne(p
, dst
, args
[0], args
[1]);
1369 emit_sge(p
, dst
, args
[0], args
[1]);
1372 emit_sgt(p
, dst
, args
[0], args
[1]);
1375 emit_slt(p
, dst
, args
[0], args
[1]);
1378 emit_sle(p
, dst
, args
[0], args
[1]);
1381 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1384 /* The args[0] value can't be used here as it won't have
1385 * correctly encoded the full swizzle:
1387 emit_swz(c
, dst
, inst
);
1390 /* round toward zero */
1391 brw_RNDZ(p
, dst
, args
[0]);
1394 emit_xpd(p
, dst
, args
[0], args
[1]);
1397 assert(if_insn
< MAX_IFSN
);
1398 if_inst
[if_insn
++] = brw_IF(p
, BRW_EXECUTE_8
);
1401 if_inst
[if_insn
-1] = brw_ELSE(p
, if_inst
[if_insn
-1]);
1404 assert(if_insn
> 0);
1405 brw_ENDIF(p
, if_inst
[--if_insn
]);
1408 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
1409 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1410 brw_set_predicate_control_flag_value(p
, 0xff);
1413 brw_set_access_mode(p
, BRW_ALIGN_1
);
1414 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1415 brw_set_access_mode(p
, BRW_ALIGN_16
);
1416 brw_ADD(p
, get_addr_reg(stack_index
),
1417 get_addr_reg(stack_index
), brw_imm_d(4));
1418 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1419 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1422 brw_ADD(p
, get_addr_reg(stack_index
),
1423 get_addr_reg(stack_index
), brw_imm_d(-4));
1424 brw_set_access_mode(p
, BRW_ALIGN_1
);
1425 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1426 brw_set_access_mode(p
, BRW_ALIGN_16
);
1429 end_offset
= p
->nr_insn
;
1430 /* this instruction will get patched later to jump past subroutine
1433 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1439 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1445 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1446 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1447 _mesa_opcode_string(inst
->Opcode
) :
1451 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1452 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1453 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1454 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1457 /* Result color clamping.
1459 * When destination register is an output register and
1460 * it's primary/secondary front/back color, we have to clamp
1461 * the result to [0,1]. This is done by enabling the
1462 * saturation bit for the last instruction.
1464 * We don't use brw_set_saturate() as it modifies
1465 * p->current->header.saturate, which affects all the subsequent
1466 * instructions. Instead, we directly modify the header
1467 * of the last (already stored) instruction.
1469 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1470 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1471 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1472 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1473 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1474 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1481 end_inst
= &p
->store
[end_offset
];
1482 last_inst
= &p
->store
[p
->nr_insn
];
1484 /* The END instruction will be patched to jump to this code */
1485 emit_vertex_write(c
);
1487 post_vs_emit(c
, end_inst
, last_inst
);