2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
41 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
43 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
45 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
46 c
->prog_data
.total_grf
= c
->last_tmp
;
51 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
53 if (tmp
.nr
== c
->last_tmp
-1)
57 static void release_tmps( struct brw_vs_compile
*c
)
59 c
->last_tmp
= c
->first_tmp
;
64 * Preallocate GRF register before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
68 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
70 GLuint i
, reg
= 0, mrf
;
71 int attributes_in_vue
;
73 /* Determine whether to use a real constant buffer or use a block
74 * of GRF registers for constants. The later is faster but only
75 * works if everything fits in the GRF.
76 * XXX this heuristic/check may need some fine tuning...
78 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
79 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
80 c
->vp
->use_const_buffer
= GL_TRUE
;
82 c
->vp
->use_const_buffer
= GL_FALSE
;
84 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
86 /* r0 -- reserved as usual
88 c
->r0
= brw_vec8_grf(reg
, 0);
91 /* User clip planes from curbe:
93 if (c
->key
.nr_userclip
) {
94 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
95 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
98 /* Deal with curbe alignment:
100 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
103 /* Vertex program parameters from curbe:
105 if (c
->vp
->use_const_buffer
) {
106 /* get constants from a real constant buffer */
107 c
->prog_data
.curb_read_length
= 0;
108 c
->prog_data
.nr_params
= 4; /* XXX 0 causes a bug elsewhere... */
111 /* use a section of the GRF for constants */
112 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
113 for (i
= 0; i
< nr_params
; i
++) {
114 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
116 reg
+= (nr_params
+ 1) / 2;
117 c
->prog_data
.curb_read_length
= reg
- 1;
119 c
->prog_data
.nr_params
= nr_params
* 4;
122 /* Allocate input regs:
125 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
126 if (c
->prog_data
.inputs_read
& (1 << i
)) {
128 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
132 /* If there are no inputs, we'll still be reading one attribute's worth
133 * because it's required -- see urb_read_length setting.
135 if (c
->nr_inputs
== 0)
138 /* Allocate outputs. The non-position outputs go straight into message regs.
141 c
->first_output
= reg
;
142 c
->first_overflow_output
= 0;
144 if (BRW_IS_IGDNG(c
->func
.brw
))
149 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
150 if (c
->prog_data
.outputs_written
& (1 << i
)) {
152 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
153 if (i
== VERT_RESULT_HPOS
) {
154 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
157 else if (i
== VERT_RESULT_PSIZ
) {
158 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
160 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
164 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
168 /* too many vertex results to fit in MRF, use GRF for overflow */
169 if (!c
->first_overflow_output
)
170 c
->first_overflow_output
= i
;
171 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
178 /* Allocate program temporaries:
180 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
181 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
185 /* Address reg(s). Don't try to use the internal address reg until
188 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
189 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
193 BRW_VERTICAL_STRIDE_8
,
195 BRW_HORIZONTAL_STRIDE_1
,
201 if (c
->vp
->use_const_buffer
) {
202 for (i
= 0; i
< 3; i
++) {
203 c
->current_const
[i
].index
= -1;
204 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
209 for (i
= 0; i
< 128; i
++) {
210 if (c
->output_regs
[i
].used_in_src
) {
211 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
216 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
219 /* Some opcodes need an internal temporary:
222 c
->last_tmp
= reg
; /* for allocation purposes */
224 /* Each input reg holds data from two vertices. The
225 * urb_read_length is the number of registers read from *each*
226 * vertex urb, so is half the amount:
228 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
229 /* Setting this field to 0 leads to undefined behavior according to the
230 * the VS_STATE docs. Our VUEs will always have at least one attribute
231 * sitting in them, even if it's padding.
233 if (c
->prog_data
.urb_read_length
== 0)
234 c
->prog_data
.urb_read_length
= 1;
236 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
237 * them to fit the biggest thing they need to.
239 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
241 if (BRW_IS_IGDNG(c
->func
.brw
))
242 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
244 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
246 c
->prog_data
.total_grf
= reg
;
248 if (INTEL_DEBUG
& DEBUG_VS
) {
249 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
250 _mesa_printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
251 _mesa_printf("%s reg = %d\n", __FUNCTION__
, reg
);
257 * If an instruction uses a temp reg both as a src and the dest, we
258 * sometimes need to allocate an intermediate temporary.
260 static void unalias1( struct brw_vs_compile
*c
,
263 void (*func
)( struct brw_vs_compile
*,
267 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
268 struct brw_compile
*p
= &c
->func
;
269 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
271 brw_MOV(p
, dst
, tmp
);
281 * Checkes if 2-operand instruction needs an intermediate temporary.
283 static void unalias2( struct brw_vs_compile
*c
,
287 void (*func
)( struct brw_vs_compile
*,
292 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
293 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
294 struct brw_compile
*p
= &c
->func
;
295 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
296 func(c
, tmp
, arg0
, arg1
);
297 brw_MOV(p
, dst
, tmp
);
301 func(c
, dst
, arg0
, arg1
);
307 * Checkes if 3-operand instruction needs an intermediate temporary.
309 static void unalias3( struct brw_vs_compile
*c
,
314 void (*func
)( struct brw_vs_compile
*,
320 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
321 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
322 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
323 struct brw_compile
*p
= &c
->func
;
324 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
325 func(c
, tmp
, arg0
, arg1
, arg2
);
326 brw_MOV(p
, dst
, tmp
);
330 func(c
, dst
, arg0
, arg1
, arg2
);
334 static void emit_sop( struct brw_vs_compile
*c
,
340 struct brw_compile
*p
= &c
->func
;
342 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
343 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
344 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
345 brw_set_predicate_control_flag_value(p
, 0xff);
348 static void emit_seq( struct brw_vs_compile
*c
,
351 struct brw_reg arg1
)
353 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
356 static void emit_sne( struct brw_vs_compile
*c
,
359 struct brw_reg arg1
)
361 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
363 static void emit_slt( struct brw_vs_compile
*c
,
366 struct brw_reg arg1
)
368 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
371 static void emit_sle( struct brw_vs_compile
*c
,
374 struct brw_reg arg1
)
376 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
379 static void emit_sgt( struct brw_vs_compile
*c
,
382 struct brw_reg arg1
)
384 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
387 static void emit_sge( struct brw_vs_compile
*c
,
390 struct brw_reg arg1
)
392 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
395 static void emit_max( struct brw_compile
*p
,
398 struct brw_reg arg1
)
400 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
401 brw_SEL(p
, dst
, arg1
, arg0
);
402 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
405 static void emit_min( struct brw_compile
*p
,
408 struct brw_reg arg1
)
410 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
411 brw_SEL(p
, dst
, arg0
, arg1
);
412 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
416 static void emit_math1( struct brw_vs_compile
*c
,
422 /* There are various odd behaviours with SEND on the simulator. In
423 * addition there are documented issues with the fact that the GEN4
424 * processor doesn't do dependency control properly on SEND
425 * results. So, on balance, this kludge to get around failures
426 * with writemasked math results looks like it might be necessary
427 * whether that turns out to be a simulator bug or not:
429 struct brw_compile
*p
= &c
->func
;
430 struct brw_reg tmp
= dst
;
431 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
432 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
440 BRW_MATH_SATURATE_NONE
,
443 BRW_MATH_DATA_SCALAR
,
447 brw_MOV(p
, dst
, tmp
);
453 static void emit_math2( struct brw_vs_compile
*c
,
460 struct brw_compile
*p
= &c
->func
;
461 struct brw_reg tmp
= dst
;
462 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
463 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
468 brw_MOV(p
, brw_message_reg(3), arg1
);
473 BRW_MATH_SATURATE_NONE
,
476 BRW_MATH_DATA_SCALAR
,
480 brw_MOV(p
, dst
, tmp
);
486 static void emit_exp_noalias( struct brw_vs_compile
*c
,
488 struct brw_reg arg0
)
490 struct brw_compile
*p
= &c
->func
;
493 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
494 struct brw_reg tmp
= get_tmp(c
);
495 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
497 /* tmp_d = floor(arg0.x) */
498 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
500 /* result[0] = 2.0 ^ tmp */
502 /* Adjust exponent for floating point:
505 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
507 /* Install exponent and sign.
508 * Excess drops off the edge:
510 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
511 tmp_d
, brw_imm_d(23));
516 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
517 /* result[1] = arg0.x - floor(arg0.x) */
518 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
521 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
522 /* As with the LOG instruction, we might be better off just
523 * doing a taylor expansion here, seeing as we have to do all
526 * If mathbox partial precision is too low, consider also:
527 * result[3] = result[0] * EXP(result[1])
530 BRW_MATH_FUNCTION_EXP
,
531 brw_writemask(dst
, WRITEMASK_Z
),
532 brw_swizzle1(arg0
, 0),
533 BRW_MATH_PRECISION_FULL
);
536 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
537 /* result[3] = 1.0; */
538 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
543 static void emit_log_noalias( struct brw_vs_compile
*c
,
545 struct brw_reg arg0
)
547 struct brw_compile
*p
= &c
->func
;
548 struct brw_reg tmp
= dst
;
549 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
550 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
551 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
552 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
556 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
559 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
562 * These almost look likey they could be joined up, but not really
565 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
566 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
568 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
570 brw_writemask(tmp_ud
, WRITEMASK_X
),
571 brw_swizzle1(arg0_ud
, 0),
572 brw_imm_ud((1U<<31)-1));
575 brw_writemask(tmp_ud
, WRITEMASK_X
),
580 brw_writemask(tmp
, WRITEMASK_X
),
581 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
585 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
587 brw_writemask(tmp_ud
, WRITEMASK_Y
),
588 brw_swizzle1(arg0_ud
, 0),
589 brw_imm_ud((1<<23)-1));
592 brw_writemask(tmp_ud
, WRITEMASK_Y
),
594 brw_imm_ud(127<<23));
597 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
598 /* result[2] = result[0] + LOG2(result[1]); */
600 /* Why bother? The above is just a hint how to do this with a
601 * taylor series. Maybe we *should* use a taylor series as by
602 * the time all the above has been done it's almost certainly
603 * quicker than calling the mathbox, even with low precision.
606 * - result[0] + mathbox.LOG2(result[1])
607 * - mathbox.LOG2(arg0.x)
608 * - result[0] + inline_taylor_approx(result[1])
611 BRW_MATH_FUNCTION_LOG
,
612 brw_writemask(tmp
, WRITEMASK_Z
),
613 brw_swizzle1(tmp
, 1),
614 BRW_MATH_PRECISION_FULL
);
617 brw_writemask(tmp
, WRITEMASK_Z
),
618 brw_swizzle1(tmp
, 2),
619 brw_swizzle1(tmp
, 0));
622 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
623 /* result[3] = 1.0; */
624 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
628 brw_MOV(p
, dst
, tmp
);
634 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
636 static void emit_dst_noalias( struct brw_vs_compile
*c
,
641 struct brw_compile
*p
= &c
->func
;
643 /* There must be a better way to do this:
645 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
646 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
647 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
648 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
649 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
650 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
651 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
652 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
656 static void emit_xpd( struct brw_compile
*p
,
661 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
662 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
666 static void emit_lit_noalias( struct brw_vs_compile
*c
,
668 struct brw_reg arg0
)
670 struct brw_compile
*p
= &c
->func
;
671 struct brw_instruction
*if_insn
;
672 struct brw_reg tmp
= dst
;
673 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
678 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
679 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
681 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
682 * to get all channels active inside the IF. In the clipping code
683 * we run with NoMask, so it's not an option and we can use
684 * BRW_EXECUTE_1 for all comparisions.
686 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
687 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
689 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
691 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
692 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
693 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
696 BRW_MATH_FUNCTION_POW
,
697 brw_writemask(dst
, WRITEMASK_Z
),
698 brw_swizzle1(tmp
, 2),
699 brw_swizzle1(arg0
, 3),
700 BRW_MATH_PRECISION_PARTIAL
);
703 brw_ENDIF(p
, if_insn
);
708 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
714 struct brw_compile
*p
= &c
->func
;
716 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
717 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
718 brw_MAC(p
, dst
, arg0
, arg1
);
721 /** 3 or 4-component vector normalization */
722 static void emit_nrm( struct brw_vs_compile
*c
,
727 struct brw_compile
*p
= &c
->func
;
728 struct brw_reg tmp
= get_tmp(c
);
730 /* tmp = dot(arg0, arg0) */
732 brw_DP3(p
, tmp
, arg0
, arg0
);
734 brw_DP4(p
, tmp
, arg0
, arg0
);
736 /* tmp = 1 / sqrt(tmp) */
737 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
739 /* dst = arg0 * tmp */
740 brw_MUL(p
, dst
, arg0
, tmp
);
746 static struct brw_reg
747 get_constant(struct brw_vs_compile
*c
,
748 const struct prog_instruction
*inst
,
751 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
752 struct brw_compile
*p
= &c
->func
;
753 struct brw_reg const_reg
;
754 struct brw_reg const2_reg
;
755 const GLboolean relAddr
= src
->RelAddr
;
757 assert(argIndex
< 3);
759 if (c
->current_const
[argIndex
].index
!= src
->Index
|| relAddr
) {
760 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
762 c
->current_const
[argIndex
].index
= src
->Index
;
765 printf(" fetch const[%d] for arg %d into reg %d\n",
766 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
768 /* need to fetch the constant now */
770 c
->current_const
[argIndex
].reg
,/* writeback dest */
772 relAddr
, /* relative indexing? */
773 addrReg
, /* address register */
774 16 * src
->Index
, /* byte offset */
775 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
780 const2_reg
= get_tmp(c
);
782 /* use upper half of address reg for second read */
783 addrReg
= stride(addrReg
, 0, 4, 0);
787 const2_reg
, /* writeback dest */
789 relAddr
, /* relative indexing? */
790 addrReg
, /* address register */
791 16 * src
->Index
, /* byte offset */
792 SURF_INDEX_VERT_CONST_BUFFER
797 const_reg
= c
->current_const
[argIndex
].reg
;
800 /* merge the two Owords into the constant register */
801 /* const_reg[7..4] = const2_reg[7..4] */
803 suboffset(stride(const_reg
, 0, 4, 1), 4),
804 suboffset(stride(const2_reg
, 0, 4, 1), 4));
805 release_tmp(c
, const2_reg
);
808 /* replicate lower four floats into upper half (to get XYZWXYZW) */
809 const_reg
= stride(const_reg
, 0, 4, 0);
818 /* TODO: relative addressing!
820 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
821 gl_register_file file
,
825 case PROGRAM_TEMPORARY
:
828 assert(c
->regs
[file
][index
].nr
!= 0);
829 return c
->regs
[file
][index
];
830 case PROGRAM_STATE_VAR
:
831 case PROGRAM_CONSTANT
:
832 case PROGRAM_UNIFORM
:
833 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
834 return c
->regs
[PROGRAM_STATE_VAR
][index
];
835 case PROGRAM_ADDRESS
:
837 return c
->regs
[file
][index
];
839 case PROGRAM_UNDEFINED
: /* undef values */
840 return brw_null_reg();
842 case PROGRAM_LOCAL_PARAM
:
843 case PROGRAM_ENV_PARAM
:
844 case PROGRAM_WRITE_ONLY
:
847 return brw_null_reg();
853 * Indirect addressing: get reg[[arg] + offset].
855 static struct brw_reg
deref( struct brw_vs_compile
*c
,
859 struct brw_compile
*p
= &c
->func
;
860 struct brw_reg tmp
= vec4(get_tmp(c
));
861 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
862 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_UW
);
863 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
864 struct brw_reg indirect
= brw_vec4_indirect(0,0);
867 brw_push_insn_state(p
);
868 brw_set_access_mode(p
, BRW_ALIGN_1
);
870 /* This is pretty clunky - load the address register twice and
871 * fetch each 4-dword value in turn. There must be a way to do
872 * this in a single pass, but I couldn't get it to work.
874 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
875 brw_MOV(p
, tmp
, indirect
);
877 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
878 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
880 brw_pop_insn_state(p
);
883 /* NOTE: tmp not released */
889 * Get brw reg corresponding to the instruction's [argIndex] src reg.
890 * TODO: relative addressing!
892 static struct brw_reg
893 get_src_reg( struct brw_vs_compile
*c
,
894 const struct prog_instruction
*inst
,
897 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
898 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
899 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
902 case PROGRAM_TEMPORARY
:
906 return deref(c
, c
->regs
[file
][0], index
);
909 assert(c
->regs
[file
][index
].nr
!= 0);
910 return c
->regs
[file
][index
];
913 case PROGRAM_STATE_VAR
:
914 case PROGRAM_CONSTANT
:
915 case PROGRAM_UNIFORM
:
916 case PROGRAM_ENV_PARAM
:
917 case PROGRAM_LOCAL_PARAM
:
918 if (c
->vp
->use_const_buffer
) {
919 return get_constant(c
, inst
, argIndex
);
922 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
);
925 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
926 return c
->regs
[PROGRAM_STATE_VAR
][index
];
928 case PROGRAM_ADDRESS
:
930 return c
->regs
[file
][index
];
932 case PROGRAM_UNDEFINED
:
933 /* this is a normal case since we loop over all three src args */
934 return brw_null_reg();
936 case PROGRAM_WRITE_ONLY
:
939 return brw_null_reg();
944 static void emit_arl( struct brw_vs_compile
*c
,
946 struct brw_reg arg0
)
948 struct brw_compile
*p
= &c
->func
;
949 struct brw_reg tmp
= dst
;
950 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
955 brw_RNDD(p
, tmp
, arg0
); /* tmp = round(arg0) */
956 brw_MUL(p
, dst
, tmp
, brw_imm_d(16)); /* dst = tmp * 16 */
964 * Return the brw reg for the given instruction's src argument.
965 * Will return mangled results for SWZ op. The emit_swz() function
966 * ignores this result and recalculates taking extended swizzles into
969 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
970 const struct prog_instruction
*inst
,
973 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
976 if (src
->File
== PROGRAM_UNDEFINED
)
977 return brw_null_reg();
979 reg
= get_src_reg(c
, inst
, argIndex
);
981 /* Convert 3-bit swizzle to 2-bit.
983 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
984 GET_SWZ(src
->Swizzle
, 1),
985 GET_SWZ(src
->Swizzle
, 2),
986 GET_SWZ(src
->Swizzle
, 3));
988 /* Note this is ok for non-swizzle instructions:
990 reg
.negate
= src
->Negate
? 1 : 0;
997 * Get brw register for the given program dest register.
999 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1000 struct prog_dst_register dst
)
1005 case PROGRAM_TEMPORARY
:
1006 case PROGRAM_OUTPUT
:
1007 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1008 reg
= c
->regs
[dst
.File
][dst
.Index
];
1010 case PROGRAM_ADDRESS
:
1011 assert(dst
.Index
== 0);
1012 reg
= c
->regs
[dst
.File
][dst
.Index
];
1014 case PROGRAM_UNDEFINED
:
1015 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1016 reg
= brw_null_reg();
1020 reg
= brw_null_reg();
1023 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1029 static void emit_swz( struct brw_vs_compile
*c
,
1031 const struct prog_instruction
*inst
)
1033 const GLuint argIndex
= 0;
1034 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1035 struct brw_compile
*p
= &c
->func
;
1036 GLuint zeros_mask
= 0;
1037 GLuint ones_mask
= 0;
1038 GLuint src_mask
= 0;
1040 GLboolean need_tmp
= (src
.Negate
&&
1041 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1042 struct brw_reg tmp
= dst
;
1048 for (i
= 0; i
< 4; i
++) {
1049 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1050 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1069 /* Do src first, in case dst aliases src:
1072 struct brw_reg arg0
;
1074 arg0
= get_src_reg(c
, inst
, argIndex
);
1076 arg0
= brw_swizzle(arg0
,
1077 src_swz
[0], src_swz
[1],
1078 src_swz
[2], src_swz
[3]);
1080 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1084 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1087 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1090 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1093 brw_MOV(p
, dst
, tmp
);
1094 release_tmp(c
, tmp
);
1100 * Post-vertex-program processing. Send the results to the URB.
1102 static void emit_vertex_write( struct brw_vs_compile
*c
)
1104 struct brw_compile
*p
= &c
->func
;
1105 struct brw_reg m0
= brw_message_reg(0);
1106 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1109 GLuint len_vertext_header
= 2;
1111 if (c
->key
.copy_edgeflag
) {
1113 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1114 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1117 /* Build ndc coords */
1119 /* ndc = 1.0 / pos.w */
1120 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1121 /* ndc.xyz = pos * ndc */
1122 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1124 /* Update the header for point size, user clipping flags, and -ve rhw
1127 if ((c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) ||
1128 c
->key
.nr_userclip
|| BRW_IS_965(p
->brw
))
1130 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1133 brw_MOV(p
, header1
, brw_imm_ud(0));
1135 brw_set_access_mode(p
, BRW_ALIGN_16
);
1137 if (c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) {
1138 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1139 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1140 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1143 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1144 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1145 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1146 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1147 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1150 /* i965 clipping workaround:
1151 * 1) Test for -ve rhw
1153 * set ndc = (0,0,0,0)
1156 * Later, clipping will detect ucp[6] and ensure the primitive is
1157 * clipped against all fixed planes.
1159 if (BRW_IS_965(p
->brw
)) {
1161 vec8(brw_null_reg()),
1163 brw_swizzle1(ndc
, 3),
1166 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1167 brw_MOV(p
, ndc
, brw_imm_f(0));
1168 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1171 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1172 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1173 brw_set_access_mode(p
, BRW_ALIGN_16
);
1175 release_tmp(c
, header1
);
1178 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1181 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1182 * of zeros followed by two sets of NDC coordinates:
1184 brw_set_access_mode(p
, BRW_ALIGN_1
);
1185 brw_MOV(p
, offset(m0
, 2), ndc
);
1187 if (BRW_IS_IGDNG(p
->brw
)) {
1188 /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1189 brw_MOV(p
, offset(m0
, 3), pos
); /* a portion of vertex header */
1190 /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1191 * Seems it is useless for us.
1192 * m6 is used for aligning, so that the remainder of vertex element is
1195 brw_MOV(p
, offset(m0
, 7), pos
); /* the remainder of vertex element */
1196 len_vertext_header
= 6;
1198 brw_MOV(p
, offset(m0
, 3), pos
);
1199 len_vertext_header
= 2;
1202 eot
= (c
->first_overflow_output
== 0);
1205 brw_null_reg(), /* dest */
1206 0, /* starting mrf reg nr */
1210 MIN2(c
->nr_outputs
+ 1 + len_vertext_header
, (BRW_MAX_MRF
-1)), /* msg len */
1211 0, /* response len */
1213 eot
, /* writes complete */
1214 0, /* urb destination offset */
1215 BRW_URB_SWIZZLE_INTERLEAVE
);
1217 if (c
->first_overflow_output
> 0) {
1218 /* Not all of the vertex outputs/results fit into the MRF.
1219 * Move the overflowed attributes from the GRF to the MRF and
1220 * issue another brw_urb_WRITE().
1222 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1226 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1227 if (c
->prog_data
.outputs_written
& (1 << i
)) {
1228 /* move from GRF to MRF */
1229 brw_MOV(p
, brw_message_reg(4+mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1235 brw_null_reg(), /* dest */
1236 4, /* starting mrf reg nr */
1240 mrf
+1, /* msg len */
1241 0, /* response len */
1243 1, /* writes complete */
1244 BRW_MAX_MRF
-1, /* urb destination offset */
1245 BRW_URB_SWIZZLE_INTERLEAVE
);
1251 * Called after code generation to resolve subroutine calls and the
1253 * \param end_inst points to brw code for END instruction
1254 * \param last_inst points to last instruction emitted before vertex write
1257 post_vs_emit( struct brw_vs_compile
*c
,
1258 struct brw_instruction
*end_inst
,
1259 struct brw_instruction
*last_inst
)
1263 brw_resolve_cals(&c
->func
);
1265 /* patch up the END code to jump past subroutines, etc */
1266 offset
= last_inst
- end_inst
;
1268 brw_set_src1(end_inst
, brw_imm_d(offset
* 16));
1270 end_inst
->header
.opcode
= BRW_OPCODE_NOP
;
1275 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1277 struct brw_compile
*p
= &c
->func
;
1278 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1280 if (p
->nr_insn
== 0)
1283 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1286 switch (prev_insn
->header
.opcode
) {
1287 case BRW_OPCODE_MOV
:
1288 case BRW_OPCODE_MAC
:
1289 case BRW_OPCODE_MUL
:
1290 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1291 prev_insn
->header
.execution_size
== val
.width
&&
1292 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1293 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1294 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1295 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1296 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1297 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1307 get_predicate(const struct prog_instruction
*inst
)
1309 if (inst
->DstReg
.CondMask
== COND_TR
)
1310 return BRW_PREDICATE_NONE
;
1312 /* All of GLSL only produces predicates for COND_NE and one channel per
1313 * vector. Fail badly if someone starts doing something else, as it might
1314 * mean infinite looping or something.
1316 * We'd like to support all the condition codes, but our hardware doesn't
1317 * quite match the Mesa IR, which is modeled after the NV extensions. For
1318 * those, the instruction may update the condition codes or not, then any
1319 * later instruction may use one of those condition codes. For gen4, the
1320 * instruction may update the flags register based on one of the condition
1321 * codes output by the instruction, and then further instructions may
1322 * predicate on that. We can probably support this, but it won't
1323 * necessarily be easy.
1325 assert(inst
->DstReg
.CondMask
== COND_NE
);
1327 switch (inst
->DstReg
.CondSwizzle
) {
1329 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1331 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1333 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1335 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1337 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1338 inst
->DstReg
.CondMask
);
1339 return BRW_PREDICATE_NORMAL
;
1343 /* Emit the vertex program instructions here.
1345 void brw_vs_emit(struct brw_vs_compile
*c
)
1347 #define MAX_IF_DEPTH 32
1348 #define MAX_LOOP_DEPTH 32
1349 struct brw_compile
*p
= &c
->func
;
1350 struct brw_context
*brw
= p
->brw
;
1351 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1352 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1353 GLuint end_offset
= 0;
1354 struct brw_instruction
*end_inst
, *last_inst
;
1355 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
];
1356 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1360 if (INTEL_DEBUG
& DEBUG_VS
) {
1361 _mesa_printf("vs-mesa:\n");
1362 _mesa_print_program(&c
->vp
->program
.Base
);
1366 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1367 brw_set_access_mode(p
, BRW_ALIGN_16
);
1369 /* Message registers can't be read, so copy the output into GRF register
1370 if they are used in source registers */
1371 for (insn
= 0; insn
< nr_insns
; insn
++) {
1373 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1374 for (i
= 0; i
< 3; i
++) {
1375 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1376 GLuint index
= src
->Index
;
1377 GLuint file
= src
->File
;
1378 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1379 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1383 /* Static register allocation
1385 brw_vs_alloc_regs(c
);
1386 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1388 for (insn
= 0; insn
< nr_insns
; insn
++) {
1390 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1391 struct brw_reg args
[3], dst
;
1395 printf("%d: ", insn
);
1396 _mesa_print_instruction(inst
);
1399 /* Get argument regs. SWZ is special and does this itself.
1401 if (inst
->Opcode
!= OPCODE_SWZ
)
1402 for (i
= 0; i
< 3; i
++) {
1403 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1406 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1407 args
[i
] = c
->output_regs
[index
].reg
;
1409 args
[i
] = get_arg(c
, inst
, i
);
1412 /* Get dest regs. Note that it is possible for a reg to be both
1413 * dst and arg, given the static allocation of registers. So
1414 * care needs to be taken emitting multi-operation instructions.
1416 index
= inst
->DstReg
.Index
;
1417 file
= inst
->DstReg
.File
;
1418 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1419 dst
= c
->output_regs
[index
].reg
;
1421 dst
= get_dst(c
, inst
->DstReg
);
1423 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1424 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1425 inst
->SaturateMode
);
1428 switch (inst
->Opcode
) {
1430 brw_MOV(p
, dst
, brw_abs(args
[0]));
1433 brw_ADD(p
, dst
, args
[0], args
[1]);
1436 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1439 brw_DP3(p
, dst
, args
[0], args
[1]);
1442 brw_DP4(p
, dst
, args
[0], args
[1]);
1445 brw_DPH(p
, dst
, args
[0], args
[1]);
1448 emit_nrm(c
, dst
, args
[0], 3);
1451 emit_nrm(c
, dst
, args
[0], 4);
1454 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1457 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1460 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1463 emit_arl(c
, dst
, args
[0]);
1466 brw_RNDD(p
, dst
, args
[0]);
1469 brw_FRC(p
, dst
, args
[0]);
1472 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1475 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1478 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1481 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1484 if (!accumulator_contains(c
, args
[2]))
1485 brw_MOV(p
, brw_acc_reg(), args
[2]);
1486 brw_MAC(p
, dst
, args
[0], args
[1]);
1489 emit_max(p
, dst
, args
[0], args
[1]);
1492 emit_min(p
, dst
, args
[0], args
[1]);
1495 brw_MOV(p
, dst
, args
[0]);
1498 brw_MUL(p
, dst
, args
[0], args
[1]);
1501 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1504 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1507 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1511 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1514 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1517 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1520 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1523 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1526 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1529 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1532 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1535 /* The args[0] value can't be used here as it won't have
1536 * correctly encoded the full swizzle:
1538 emit_swz(c
, dst
, inst
);
1541 /* round toward zero */
1542 brw_RNDZ(p
, dst
, args
[0]);
1545 emit_xpd(p
, dst
, args
[0], args
[1]);
1548 assert(if_depth
< MAX_IF_DEPTH
);
1549 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1550 /* Note that brw_IF smashes the predicate_control field. */
1551 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1555 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1558 assert(if_depth
> 0);
1559 brw_ENDIF(p
, if_inst
[--if_depth
]);
1561 case OPCODE_BGNLOOP
:
1562 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1565 brw_set_predicate_control(p
, get_predicate(inst
));
1567 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1570 brw_set_predicate_control(p
, get_predicate(inst
));
1572 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1574 case OPCODE_ENDLOOP
:
1576 struct brw_instruction
*inst0
, *inst1
;
1581 if (BRW_IS_IGDNG(brw
))
1584 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1585 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1586 while (inst0
> loop_inst
[loop_depth
]) {
1588 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
) {
1589 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1590 inst0
->bits3
.if_else
.pop_count
= 0;
1592 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
) {
1593 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1594 inst0
->bits3
.if_else
.pop_count
= 0;
1600 brw_set_predicate_control(p
, get_predicate(inst
));
1601 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1602 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1605 brw_set_access_mode(p
, BRW_ALIGN_1
);
1606 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1607 brw_set_access_mode(p
, BRW_ALIGN_16
);
1608 brw_ADD(p
, get_addr_reg(stack_index
),
1609 get_addr_reg(stack_index
), brw_imm_d(4));
1610 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1611 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1614 brw_ADD(p
, get_addr_reg(stack_index
),
1615 get_addr_reg(stack_index
), brw_imm_d(-4));
1616 brw_set_access_mode(p
, BRW_ALIGN_1
);
1617 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1618 brw_set_access_mode(p
, BRW_ALIGN_16
);
1621 end_offset
= p
->nr_insn
;
1622 /* this instruction will get patched later to jump past subroutine
1625 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1631 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1637 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1638 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1639 _mesa_opcode_string(inst
->Opcode
) :
1643 /* Set the predication update on the last instruction of the native
1644 * instruction sequence.
1646 * This would be problematic if it was set on a math instruction,
1647 * but that shouldn't be the case with the current GLSL compiler.
1649 if (inst
->CondUpdate
) {
1650 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
1652 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
1653 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
1656 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1657 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1658 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1659 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1662 /* Result color clamping.
1664 * When destination register is an output register and
1665 * it's primary/secondary front/back color, we have to clamp
1666 * the result to [0,1]. This is done by enabling the
1667 * saturation bit for the last instruction.
1669 * We don't use brw_set_saturate() as it modifies
1670 * p->current->header.saturate, which affects all the subsequent
1671 * instructions. Instead, we directly modify the header
1672 * of the last (already stored) instruction.
1674 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1675 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1676 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1677 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1678 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1679 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1686 end_inst
= &p
->store
[end_offset
];
1687 last_inst
= &p
->store
[p
->nr_insn
];
1689 /* The END instruction will be patched to jump to this code */
1690 emit_vertex_write(c
);
1692 post_vs_emit(c
, end_inst
, last_inst
);
1694 if (INTEL_DEBUG
& DEBUG_VS
) {
1697 _mesa_printf("vs-native:\n");
1698 for (i
= 0; i
< p
->nr_insn
; i
++)
1699 brw_disasm(stderr
, &p
->store
[i
]);