2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
40 /* Return whether source argument 'arg' of 'opcode' can be an immediate float
41 * operand instead of a PROGRAM_CONSTANT value fetched through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
64 /* These opcodes get broken down in a way that allow two
65 * args to be immediates.
67 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
68 if (arg
== 1 || arg
== 2)
72 if (opcode
> ARRAY_SIZE(opcode_array
))
75 return arg
== opcode_array
[opcode
] - 1;
78 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
80 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
82 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
83 c
->prog_data
.total_grf
= c
->last_tmp
;
88 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
90 if (tmp
.nr
== c
->last_tmp
-1)
94 static void release_tmps( struct brw_vs_compile
*c
)
96 c
->last_tmp
= c
->first_tmp
;
101 * Preallocate GRF register before code emit.
102 * Do things as simply as possible. Allocate and populate all regs
105 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
107 struct intel_context
*intel
= &c
->func
.brw
->intel
;
108 GLuint i
, reg
= 0, mrf
;
109 int attributes_in_vue
;
111 /* Determine whether to use a real constant buffer or use a block
112 * of GRF registers for constants. The latter is faster but only
113 * works if everything fits in the GRF.
114 * XXX this heuristic/check may need some fine tuning...
116 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
117 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
118 c
->vp
->use_const_buffer
= GL_TRUE
;
120 c
->vp
->use_const_buffer
= GL_FALSE
;
122 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
124 /* r0 -- reserved as usual
126 c
->r0
= brw_vec8_grf(reg
, 0);
129 /* User clip planes from curbe:
131 if (c
->key
.nr_userclip
) {
132 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
133 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
136 /* Deal with curbe alignment:
138 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
141 /* Vertex program parameters from curbe:
143 if (c
->vp
->use_const_buffer
) {
144 int max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
147 /* We've got more constants than we can load with the push
148 * mechanism. This is often correlated with reladdr loads where
149 * we should probably be using a pull mechanism anyway to avoid
150 * excessive reading. However, the pull mechanism is slow in
151 * general. So, we try to allocate as many non-reladdr-loaded
152 * constants through the push buffer as we can before giving up.
154 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
156 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
158 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
161 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
162 if ((inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
163 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
164 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
165 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
166 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) ||
167 inst
->SrcReg
[arg
].RelAddr
)
170 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
171 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
176 for (i
= 0; i
< constant
; i
++) {
177 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2,
181 reg
+= (constant
+ 1) / 2;
182 c
->prog_data
.curb_read_length
= reg
- 1;
183 /* XXX 0 causes a bug elsewhere... */
184 c
->prog_data
.nr_params
= MAX2(constant
* 4, 4);
187 /* use a section of the GRF for constants */
188 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
189 for (i
= 0; i
< nr_params
; i
++) {
190 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
192 reg
+= (nr_params
+ 1) / 2;
193 c
->prog_data
.curb_read_length
= reg
- 1;
195 c
->prog_data
.nr_params
= nr_params
* 4;
198 /* Allocate input regs:
201 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
202 if (c
->prog_data
.inputs_read
& (1 << i
)) {
204 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
208 /* If there are no inputs, we'll still be reading one attribute's worth
209 * because it's required -- see urb_read_length setting.
211 if (c
->nr_inputs
== 0)
214 /* Allocate outputs. The non-position outputs go straight into message regs.
217 c
->first_output
= reg
;
218 c
->first_overflow_output
= 0;
222 else if (intel
->gen
== 5)
227 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
228 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
230 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
231 if (i
== VERT_RESULT_HPOS
) {
232 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
235 else if (i
== VERT_RESULT_PSIZ
) {
236 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
238 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
242 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
246 /* too many vertex results to fit in MRF, use GRF for overflow */
247 if (!c
->first_overflow_output
)
248 c
->first_overflow_output
= i
;
249 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
256 /* Allocate program temporaries:
258 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
259 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
263 /* Address reg(s). Don't try to use the internal address reg until
266 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
267 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
271 BRW_VERTICAL_STRIDE_8
,
273 BRW_HORIZONTAL_STRIDE_1
,
279 if (c
->vp
->use_const_buffer
) {
280 for (i
= 0; i
< 3; i
++) {
281 c
->current_const
[i
].index
= -1;
282 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
287 for (i
= 0; i
< 128; i
++) {
288 if (c
->output_regs
[i
].used_in_src
) {
289 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
294 if (c
->needs_stack
) {
295 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
299 /* Some opcodes need an internal temporary:
302 c
->last_tmp
= reg
; /* for allocation purposes */
304 /* Each input reg holds data from two vertices. The
305 * urb_read_length is the number of registers read from *each*
306 * vertex urb, so is half the amount:
308 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
309 /* Setting this field to 0 leads to undefined behavior according to the
310 * VS_STATE docs. Our VUEs will always have at least one attribute
311 * sitting in them, even if it's padding.
313 if (c
->prog_data
.urb_read_length
== 0)
314 c
->prog_data
.urb_read_length
= 1;
316 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
317 * them to fit the biggest thing they need to.
319 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
322 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 4 + 7) / 8;
323 else if (intel
->gen
== 5)
324 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
326 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
328 c
->prog_data
.total_grf
= reg
;
330 if (INTEL_DEBUG
& DEBUG_VS
) {
331 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
332 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
333 printf("%s reg = %d\n", __FUNCTION__
, reg
);
339 * If an instruction uses a temp reg both as a src and the dest, we
340 * sometimes need to allocate an intermediate temporary.
342 static void unalias1( struct brw_vs_compile
*c
,
345 void (*func
)( struct brw_vs_compile
*,
349 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
350 struct brw_compile
*p
= &c
->func
;
351 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
353 brw_MOV(p
, dst
, tmp
);
363 * Checks if 2-operand instruction needs an intermediate temporary.
365 static void unalias2( struct brw_vs_compile
*c
,
369 void (*func
)( struct brw_vs_compile
*,
374 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
375 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
376 struct brw_compile
*p
= &c
->func
;
377 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
378 func(c
, tmp
, arg0
, arg1
);
379 brw_MOV(p
, dst
, tmp
);
383 func(c
, dst
, arg0
, arg1
);
389 * Checks if 3-operand instruction needs an intermediate temporary.
391 static void unalias3( struct brw_vs_compile
*c
,
396 void (*func
)( struct brw_vs_compile
*,
402 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
403 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
404 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
405 struct brw_compile
*p
= &c
->func
;
406 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
407 func(c
, tmp
, arg0
, arg1
, arg2
);
408 brw_MOV(p
, dst
, tmp
);
412 func(c
, dst
, arg0
, arg1
, arg2
);
416 static void emit_sop( struct brw_vs_compile
*c
,
422 struct brw_compile
*p
= &c
->func
;
424 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
425 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
426 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
427 brw_set_predicate_control_flag_value(p
, 0xff);
430 static void emit_seq( struct brw_vs_compile
*c
,
433 struct brw_reg arg1
)
435 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
438 static void emit_sne( struct brw_vs_compile
*c
,
441 struct brw_reg arg1
)
443 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
445 static void emit_slt( struct brw_vs_compile
*c
,
448 struct brw_reg arg1
)
450 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
453 static void emit_sle( struct brw_vs_compile
*c
,
456 struct brw_reg arg1
)
458 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
461 static void emit_sgt( struct brw_vs_compile
*c
,
464 struct brw_reg arg1
)
466 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
469 static void emit_sge( struct brw_vs_compile
*c
,
472 struct brw_reg arg1
)
474 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
477 static void emit_cmp( struct brw_compile
*p
,
481 struct brw_reg arg2
)
483 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
484 brw_SEL(p
, dst
, arg1
, arg2
);
485 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
488 static void emit_sign(struct brw_vs_compile
*c
,
492 struct brw_compile
*p
= &c
->func
;
494 brw_MOV(p
, dst
, brw_imm_f(0));
496 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
497 brw_MOV(p
, dst
, brw_imm_f(-1.0));
498 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
500 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
501 brw_MOV(p
, dst
, brw_imm_f(1.0));
502 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
505 static void emit_max( struct brw_compile
*p
,
508 struct brw_reg arg1
)
510 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
511 brw_SEL(p
, dst
, arg0
, arg1
);
512 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
515 static void emit_min( struct brw_compile
*p
,
518 struct brw_reg arg1
)
520 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
521 brw_SEL(p
, dst
, arg0
, arg1
);
522 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
526 static void emit_math1( struct brw_vs_compile
*c
,
532 /* There are various odd behaviours with SEND on the simulator. In
533 * addition there are documented issues with the fact that the GEN4
534 * processor doesn't do dependency control properly on SEND
535 * results. So, on balance, this kludge to get around failures
536 * with writemasked math results looks like it might be necessary
537 * whether that turns out to be a simulator bug or not:
539 struct brw_compile
*p
= &c
->func
;
540 struct intel_context
*intel
= &p
->brw
->intel
;
541 struct brw_reg tmp
= dst
;
542 GLboolean need_tmp
= (intel
->gen
< 6 &&
543 (dst
.dw1
.bits
.writemask
!= 0xf ||
544 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
552 BRW_MATH_SATURATE_NONE
,
555 BRW_MATH_DATA_SCALAR
,
559 brw_MOV(p
, dst
, tmp
);
565 static void emit_math2( struct brw_vs_compile
*c
,
572 struct brw_compile
*p
= &c
->func
;
573 struct intel_context
*intel
= &p
->brw
->intel
;
574 struct brw_reg tmp
= dst
;
575 GLboolean need_tmp
= (intel
->gen
< 6 &&
576 (dst
.dw1
.bits
.writemask
!= 0xf ||
577 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
582 brw_MOV(p
, brw_message_reg(3), arg1
);
587 BRW_MATH_SATURATE_NONE
,
590 BRW_MATH_DATA_SCALAR
,
594 brw_MOV(p
, dst
, tmp
);
600 static void emit_exp_noalias( struct brw_vs_compile
*c
,
602 struct brw_reg arg0
)
604 struct brw_compile
*p
= &c
->func
;
607 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
608 struct brw_reg tmp
= get_tmp(c
);
609 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
611 /* tmp_d = floor(arg0.x) */
612 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
614 /* result[0] = 2.0 ^ tmp */
616 /* Adjust exponent for floating point:
619 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
621 /* Install exponent and sign.
622 * Excess drops off the edge:
624 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
625 tmp_d
, brw_imm_d(23));
630 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
631 /* result[1] = arg0.x - floor(arg0.x) */
632 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
635 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
636 /* As with the LOG instruction, we might be better off just
637 * doing a taylor expansion here, seeing as we have to do all
640 * If mathbox partial precision is too low, consider also:
641 * result[3] = result[0] * EXP(result[1])
644 BRW_MATH_FUNCTION_EXP
,
645 brw_writemask(dst
, WRITEMASK_Z
),
646 brw_swizzle1(arg0
, 0),
647 BRW_MATH_PRECISION_FULL
);
650 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
651 /* result[3] = 1.0; */
652 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
657 static void emit_log_noalias( struct brw_vs_compile
*c
,
659 struct brw_reg arg0
)
661 struct brw_compile
*p
= &c
->func
;
662 struct brw_reg tmp
= dst
;
663 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
664 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
665 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
666 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
670 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
673 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
676 * These almost look like they could be joined up, but not really
679 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
680 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
682 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
684 brw_writemask(tmp_ud
, WRITEMASK_X
),
685 brw_swizzle1(arg0_ud
, 0),
686 brw_imm_ud((1U<<31)-1));
689 brw_writemask(tmp_ud
, WRITEMASK_X
),
694 brw_writemask(tmp
, WRITEMASK_X
),
695 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
699 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
701 brw_writemask(tmp_ud
, WRITEMASK_Y
),
702 brw_swizzle1(arg0_ud
, 0),
703 brw_imm_ud((1<<23)-1));
706 brw_writemask(tmp_ud
, WRITEMASK_Y
),
708 brw_imm_ud(127<<23));
711 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
712 /* result[2] = result[0] + LOG2(result[1]); */
714 /* Why bother? The above is just a hint how to do this with a
715 * taylor series. Maybe we *should* use a taylor series as by
716 * the time all the above has been done it's almost certainly
717 * quicker than calling the mathbox, even with low precision.
720 * - result[0] + mathbox.LOG2(result[1])
721 * - mathbox.LOG2(arg0.x)
722 * - result[0] + inline_taylor_approx(result[1])
725 BRW_MATH_FUNCTION_LOG
,
726 brw_writemask(tmp
, WRITEMASK_Z
),
727 brw_swizzle1(tmp
, 1),
728 BRW_MATH_PRECISION_FULL
);
731 brw_writemask(tmp
, WRITEMASK_Z
),
732 brw_swizzle1(tmp
, 2),
733 brw_swizzle1(tmp
, 0));
736 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
737 /* result[3] = 1.0; */
738 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
742 brw_MOV(p
, dst
, tmp
);
748 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
750 static void emit_dst_noalias( struct brw_vs_compile
*c
,
755 struct brw_compile
*p
= &c
->func
;
757 /* There must be a better way to do this:
759 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
760 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
761 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
762 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
763 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
764 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
765 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
766 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
770 static void emit_xpd( struct brw_compile
*p
,
775 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
776 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
780 static void emit_lit_noalias( struct brw_vs_compile
*c
,
782 struct brw_reg arg0
)
784 struct brw_compile
*p
= &c
->func
;
785 struct brw_instruction
*if_insn
;
786 struct brw_reg tmp
= dst
;
787 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
792 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
793 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
795 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
796 * to get all channels active inside the IF. In the clipping code
797 * we run with NoMask, so it's not an option and we can use
798 * BRW_EXECUTE_1 for all comparisons.
800 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
801 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
803 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
805 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
806 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
807 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
810 BRW_MATH_FUNCTION_POW
,
811 brw_writemask(dst
, WRITEMASK_Z
),
812 brw_swizzle1(tmp
, 2),
813 brw_swizzle1(arg0
, 3),
814 BRW_MATH_PRECISION_PARTIAL
);
817 brw_ENDIF(p
, if_insn
);
822 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
828 struct brw_compile
*p
= &c
->func
;
830 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
831 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
832 brw_MAC(p
, dst
, arg0
, arg1
);
835 /** 3 or 4-component vector normalization */
836 static void emit_nrm( struct brw_vs_compile
*c
,
841 struct brw_compile
*p
= &c
->func
;
842 struct brw_reg tmp
= get_tmp(c
);
844 /* tmp = dot(arg0, arg0) */
846 brw_DP3(p
, tmp
, arg0
, arg0
);
848 brw_DP4(p
, tmp
, arg0
, arg0
);
850 /* tmp = 1 / sqrt(tmp) */
851 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
853 /* dst = arg0 * tmp */
854 brw_MUL(p
, dst
, arg0
, tmp
);
860 static struct brw_reg
861 get_constant(struct brw_vs_compile
*c
,
862 const struct prog_instruction
*inst
,
865 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
866 struct brw_compile
*p
= &c
->func
;
867 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
869 assert(argIndex
< 3);
871 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
872 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
874 /* Keep track of the last constant loaded in this slot, for reuse. */
875 c
->current_const
[argIndex
].index
= src
->Index
;
878 printf(" fetch const[%d] for arg %d into reg %d\n",
879 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
881 /* need to fetch the constant now */
883 const_reg
, /* writeback dest */
885 0, /* relative indexing? */
886 addrReg
, /* address register */
887 16 * src
->Index
, /* byte offset */
888 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
892 /* replicate lower four floats into upper half (to get XYZWXYZW) */
893 const_reg
= stride(const_reg
, 0, 4, 0);
899 static struct brw_reg
900 get_reladdr_constant(struct brw_vs_compile
*c
,
901 const struct prog_instruction
*inst
,
904 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
905 struct brw_compile
*p
= &c
->func
;
906 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
907 struct brw_reg const2_reg
;
908 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
910 assert(argIndex
< 3);
912 /* Can't reuse a reladdr constant load. */
913 c
->current_const
[argIndex
].index
= -1;
916 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
917 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
920 /* fetch the first vec4 */
922 const_reg
, /* writeback dest */
924 1, /* relative indexing? */
925 addrReg
, /* address register */
926 16 * src
->Index
, /* byte offset */
927 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
930 const2_reg
= get_tmp(c
);
932 /* use upper half of address reg for second read */
933 addrReg
= stride(addrReg
, 0, 4, 0);
937 const2_reg
, /* writeback dest */
939 1, /* relative indexing? */
940 addrReg
, /* address register */
941 16 * src
->Index
, /* byte offset */
942 SURF_INDEX_VERT_CONST_BUFFER
945 /* merge the two Owords into the constant register */
946 /* const_reg[7..4] = const2_reg[7..4] */
948 suboffset(stride(const_reg
, 0, 4, 1), 4),
949 suboffset(stride(const2_reg
, 0, 4, 1), 4));
950 release_tmp(c
, const2_reg
);
957 /* TODO: relative addressing!
959 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
960 gl_register_file file
,
964 case PROGRAM_TEMPORARY
:
967 assert(c
->regs
[file
][index
].nr
!= 0);
968 return c
->regs
[file
][index
];
969 case PROGRAM_STATE_VAR
:
970 case PROGRAM_CONSTANT
:
971 case PROGRAM_UNIFORM
:
972 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
973 return c
->regs
[PROGRAM_STATE_VAR
][index
];
974 case PROGRAM_ADDRESS
:
976 return c
->regs
[file
][index
];
978 case PROGRAM_UNDEFINED
: /* undef values */
979 return brw_null_reg();
981 case PROGRAM_LOCAL_PARAM
:
982 case PROGRAM_ENV_PARAM
:
983 case PROGRAM_WRITE_ONLY
:
986 return brw_null_reg();
992 * Indirect addressing: get reg[[arg] + offset].
994 static struct brw_reg
deref( struct brw_vs_compile
*c
,
998 struct brw_compile
*p
= &c
->func
;
999 struct brw_reg tmp
= vec4(get_tmp(c
));
1000 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1001 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_UW
);
1002 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
1003 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1006 brw_push_insn_state(p
);
1007 brw_set_access_mode(p
, BRW_ALIGN_1
);
1009 /* This is pretty clunky - load the address register twice and
1010 * fetch each 4-dword value in turn. There must be a way to do
1011 * this in a single pass, but I couldn't get it to work.
1013 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
1014 brw_MOV(p
, tmp
, indirect
);
1016 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
1017 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
1019 brw_pop_insn_state(p
);
1022 /* NOTE: tmp not released */
1028 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1029 * TODO: relative addressing!
1031 static struct brw_reg
1032 get_src_reg( struct brw_vs_compile
*c
,
1033 const struct prog_instruction
*inst
,
1036 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1037 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1038 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1040 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1041 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1043 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1047 return brw_imm_f(0.0f
);
1048 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1053 return brw_imm_f(-1.0F
);
1055 return brw_imm_f(1.0F
);
1056 } else if (src
->File
== PROGRAM_CONSTANT
) {
1057 const struct gl_program_parameter_list
*params
;
1061 switch (src
->Swizzle
) {
1076 if (component
>= 0) {
1077 params
= c
->vp
->program
.Base
.Parameters
;
1078 f
= params
->ParameterValues
[src
->Index
][component
];
1084 return brw_imm_f(f
);
1090 case PROGRAM_TEMPORARY
:
1092 case PROGRAM_OUTPUT
:
1094 return deref(c
, c
->regs
[file
][0], index
);
1097 assert(c
->regs
[file
][index
].nr
!= 0);
1098 return c
->regs
[file
][index
];
1101 case PROGRAM_STATE_VAR
:
1102 case PROGRAM_CONSTANT
:
1103 case PROGRAM_UNIFORM
:
1104 case PROGRAM_ENV_PARAM
:
1105 case PROGRAM_LOCAL_PARAM
:
1106 if (c
->vp
->use_const_buffer
) {
1107 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1108 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1109 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1111 return get_reladdr_constant(c
, inst
, argIndex
);
1113 return get_constant(c
, inst
, argIndex
);
1116 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
);
1119 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1120 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1122 case PROGRAM_ADDRESS
:
1124 return c
->regs
[file
][index
];
1126 case PROGRAM_UNDEFINED
:
1127 /* this is a normal case since we loop over all three src args */
1128 return brw_null_reg();
1130 case PROGRAM_WRITE_ONLY
:
1133 return brw_null_reg();
1138 static void emit_arl( struct brw_vs_compile
*c
,
1140 struct brw_reg arg0
)
1142 struct brw_compile
*p
= &c
->func
;
1143 struct brw_reg tmp
= dst
;
1144 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1149 brw_RNDD(p
, tmp
, arg0
); /* tmp = round(arg0) */
1150 brw_MUL(p
, dst
, tmp
, brw_imm_d(16)); /* dst = tmp * 16 */
1153 release_tmp(c
, tmp
);
1158 * Return the brw reg for the given instruction's src argument.
1159 * Will return mangled results for SWZ op. The emit_swz() function
1160 * ignores this result and recalculates taking extended swizzles into
1163 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1164 const struct prog_instruction
*inst
,
1167 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1170 if (src
->File
== PROGRAM_UNDEFINED
)
1171 return brw_null_reg();
1173 reg
= get_src_reg(c
, inst
, argIndex
);
1175 /* Convert 3-bit swizzle to 2-bit.
1177 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1178 GET_SWZ(src
->Swizzle
, 1),
1179 GET_SWZ(src
->Swizzle
, 2),
1180 GET_SWZ(src
->Swizzle
, 3));
1182 /* Note this is ok for non-swizzle instructions:
1184 reg
.negate
= src
->Negate
? 1 : 0;
1191 * Get brw register for the given program dest register.
1193 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1194 struct prog_dst_register dst
)
1199 case PROGRAM_TEMPORARY
:
1200 case PROGRAM_OUTPUT
:
1201 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1202 reg
= c
->regs
[dst
.File
][dst
.Index
];
1204 case PROGRAM_ADDRESS
:
1205 assert(dst
.Index
== 0);
1206 reg
= c
->regs
[dst
.File
][dst
.Index
];
1208 case PROGRAM_UNDEFINED
:
1209 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1210 reg
= brw_null_reg();
1214 reg
= brw_null_reg();
1217 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1223 static void emit_swz( struct brw_vs_compile
*c
,
1225 const struct prog_instruction
*inst
)
1227 const GLuint argIndex
= 0;
1228 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1229 struct brw_compile
*p
= &c
->func
;
1230 GLuint zeros_mask
= 0;
1231 GLuint ones_mask
= 0;
1232 GLuint src_mask
= 0;
1234 GLboolean need_tmp
= (src
.Negate
&&
1235 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1236 struct brw_reg tmp
= dst
;
1242 for (i
= 0; i
< 4; i
++) {
1243 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1244 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1263 /* Do src first, in case dst aliases src:
1266 struct brw_reg arg0
;
1268 arg0
= get_src_reg(c
, inst
, argIndex
);
1270 arg0
= brw_swizzle(arg0
,
1271 src_swz
[0], src_swz
[1],
1272 src_swz
[2], src_swz
[3]);
1274 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1278 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1281 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1284 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1287 brw_MOV(p
, dst
, tmp
);
1288 release_tmp(c
, tmp
);
1294 * Post-vertex-program processing. Send the results to the URB.
1296 static void emit_vertex_write( struct brw_vs_compile
*c
)
1298 struct brw_compile
*p
= &c
->func
;
1299 struct brw_context
*brw
= p
->brw
;
1300 struct intel_context
*intel
= &brw
->intel
;
1301 struct brw_reg m0
= brw_message_reg(0);
1302 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1305 GLuint len_vertex_header
= 2;
1307 if (c
->key
.copy_edgeflag
) {
1309 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1310 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1313 if (intel
->gen
< 6) {
1314 /* Build ndc coords */
1316 /* ndc = 1.0 / pos.w */
1317 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1318 /* ndc.xyz = pos * ndc */
1319 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1322 /* Update the header for point size, user clipping flags, and -ve rhw
1325 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1326 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
)
1328 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1331 brw_MOV(p
, header1
, brw_imm_ud(0));
1333 brw_set_access_mode(p
, BRW_ALIGN_16
);
1335 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1336 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1337 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1338 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1341 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1342 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1343 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1344 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1345 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1348 /* i965 clipping workaround:
1349 * 1) Test for -ve rhw
1351 * set ndc = (0,0,0,0)
1354 * Later, clipping will detect ucp[6] and ensure the primitive is
1355 * clipped against all fixed planes.
1357 if (brw
->has_negative_rhw_bug
) {
1359 vec8(brw_null_reg()),
1361 brw_swizzle1(ndc
, 3),
1364 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1365 brw_MOV(p
, ndc
, brw_imm_f(0));
1366 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1369 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1370 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1371 brw_set_access_mode(p
, BRW_ALIGN_16
);
1373 release_tmp(c
, header1
);
1376 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1379 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1380 * of zeros followed by two sets of NDC coordinates:
1382 brw_set_access_mode(p
, BRW_ALIGN_1
);
1384 if (intel
->gen
>= 6) {
1385 /* There are 16 DWs (D0-D15) in VUE header on Sandybridge:
1386 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1387 * dword 4-7 (m2) is the 4D space position
1388 * dword 8-15 (m3,m4) of the vertex header is the user clip distance.
1389 * m5 is the first vertex data we fill, which is the vertex position.
1391 brw_MOV(p
, offset(m0
, 2), pos
);
1392 brw_MOV(p
, offset(m0
, 5), pos
);
1393 len_vertex_header
= 4;
1394 } else if (intel
->gen
== 5) {
1395 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1396 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1397 * dword 4-7 (m2) is the ndc position (set above)
1398 * dword 8-11 (m3) of the vertex header is the 4D space position
1399 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1400 * m6 is a pad so that the vertex element data is aligned
1401 * m7 is the first vertex data we fill, which is the vertex position.
1403 brw_MOV(p
, offset(m0
, 2), ndc
);
1404 brw_MOV(p
, offset(m0
, 3), pos
);
1405 brw_MOV(p
, offset(m0
, 7), pos
);
1406 len_vertex_header
= 6;
1408 /* There are 8 dwords in VUE header pre-Ironlake:
1409 * dword 0-3 (m1) is indices, point width, clip flags.
1410 * dword 4-7 (m2) is ndc position (set above)
1412 * dword 8-11 (m3) is the first vertex data, which we always have be the
1415 brw_MOV(p
, offset(m0
, 2), ndc
);
1416 brw_MOV(p
, offset(m0
, 3), pos
);
1417 len_vertex_header
= 2;
1420 eot
= (c
->first_overflow_output
== 0);
1423 brw_null_reg(), /* dest */
1424 0, /* starting mrf reg nr */
1428 MIN2(c
->nr_outputs
+ 1 + len_vertex_header
, (BRW_MAX_MRF
-1)), /* msg len */
1429 0, /* response len */
1431 eot
, /* writes complete */
1432 0, /* urb destination offset */
1433 BRW_URB_SWIZZLE_INTERLEAVE
);
1435 if (c
->first_overflow_output
> 0) {
1436 /* Not all of the vertex outputs/results fit into the MRF.
1437 * Move the overflowed attributes from the GRF to the MRF and
1438 * issue another brw_urb_WRITE().
1440 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1444 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1445 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1446 /* move from GRF to MRF */
1447 brw_MOV(p
, brw_message_reg(4+mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1453 brw_null_reg(), /* dest */
1454 4, /* starting mrf reg nr */
1458 mrf
+1, /* msg len */
1459 0, /* response len */
1461 1, /* writes complete */
1462 BRW_MAX_MRF
-1, /* urb destination offset */
1463 BRW_URB_SWIZZLE_INTERLEAVE
);
1468 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1470 struct brw_compile
*p
= &c
->func
;
1471 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1473 if (p
->nr_insn
== 0)
1476 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1479 switch (prev_insn
->header
.opcode
) {
1480 case BRW_OPCODE_MOV
:
1481 case BRW_OPCODE_MAC
:
1482 case BRW_OPCODE_MUL
:
1483 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1484 prev_insn
->header
.execution_size
== val
.width
&&
1485 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1486 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1487 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1488 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1489 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1490 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1500 get_predicate(const struct prog_instruction
*inst
)
1502 if (inst
->DstReg
.CondMask
== COND_TR
)
1503 return BRW_PREDICATE_NONE
;
1505 /* All of GLSL only produces predicates for COND_NE and one channel per
1506 * vector. Fail badly if someone starts doing something else, as it might
1507 * mean infinite looping or something.
1509 * We'd like to support all the condition codes, but our hardware doesn't
1510 * quite match the Mesa IR, which is modeled after the NV extensions. For
1511 * those, the instruction may update the condition codes or not, then any
1512 * later instruction may use one of those condition codes. For gen4, the
1513 * instruction may update the flags register based on one of the condition
1514 * codes output by the instruction, and then further instructions may
1515 * predicate on that. We can probably support this, but it won't
1516 * necessarily be easy.
1518 assert(inst
->DstReg
.CondMask
== COND_NE
);
1520 switch (inst
->DstReg
.CondSwizzle
) {
1522 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1524 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1526 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1528 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1530 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1531 inst
->DstReg
.CondMask
);
1532 return BRW_PREDICATE_NORMAL
;
1536 /* Emit the vertex program instructions here.
1538 void brw_vs_emit(struct brw_vs_compile
*c
)
1540 #define MAX_IF_DEPTH 32
1541 #define MAX_LOOP_DEPTH 32
1542 struct brw_compile
*p
= &c
->func
;
1543 struct brw_context
*brw
= p
->brw
;
1544 struct intel_context
*intel
= &brw
->intel
;
1545 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1546 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1547 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1548 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1552 if (INTEL_DEBUG
& DEBUG_VS
) {
1553 printf("vs-mesa:\n");
1554 _mesa_print_program(&c
->vp
->program
.Base
);
1558 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1559 brw_set_access_mode(p
, BRW_ALIGN_16
);
1561 for (insn
= 0; insn
< nr_insns
; insn
++) {
1563 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1565 /* Message registers can't be read, so copy the output into GRF
1566 * register if they are used in source registers
1568 for (i
= 0; i
< 3; i
++) {
1569 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1570 GLuint index
= src
->Index
;
1571 GLuint file
= src
->File
;
1572 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1573 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1576 switch (inst
->Opcode
) {
1579 c
->needs_stack
= GL_TRUE
;
1586 /* Static register allocation
1588 brw_vs_alloc_regs(c
);
1591 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1593 for (insn
= 0; insn
< nr_insns
; insn
++) {
1595 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1596 struct brw_reg args
[3], dst
;
1600 printf("%d: ", insn
);
1601 _mesa_print_instruction(inst
);
1604 /* Get argument regs. SWZ is special and does this itself.
1606 if (inst
->Opcode
!= OPCODE_SWZ
)
1607 for (i
= 0; i
< 3; i
++) {
1608 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1611 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1612 args
[i
] = c
->output_regs
[index
].reg
;
1614 args
[i
] = get_arg(c
, inst
, i
);
1617 /* Get dest regs. Note that it is possible for a reg to be both
1618 * dst and arg, given the static allocation of registers. So
1619 * care needs to be taken emitting multi-operation instructions.
1621 index
= inst
->DstReg
.Index
;
1622 file
= inst
->DstReg
.File
;
1623 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1624 dst
= c
->output_regs
[index
].reg
;
1626 dst
= get_dst(c
, inst
->DstReg
);
1628 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1629 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1630 inst
->SaturateMode
);
1633 switch (inst
->Opcode
) {
1635 brw_MOV(p
, dst
, brw_abs(args
[0]));
1638 brw_ADD(p
, dst
, args
[0], args
[1]);
1641 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1644 brw_DP3(p
, dst
, args
[0], args
[1]);
1647 brw_DP4(p
, dst
, args
[0], args
[1]);
1650 brw_DPH(p
, dst
, args
[0], args
[1]);
1653 emit_nrm(c
, dst
, args
[0], 3);
1656 emit_nrm(c
, dst
, args
[0], 4);
1659 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1662 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1665 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1668 emit_arl(c
, dst
, args
[0]);
1671 brw_RNDD(p
, dst
, args
[0]);
1674 brw_FRC(p
, dst
, args
[0]);
1677 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1680 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1683 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1686 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1689 if (!accumulator_contains(c
, args
[2]))
1690 brw_MOV(p
, brw_acc_reg(), args
[2]);
1691 brw_MAC(p
, dst
, args
[0], args
[1]);
1694 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
1697 emit_max(p
, dst
, args
[0], args
[1]);
1700 emit_min(p
, dst
, args
[0], args
[1]);
1703 brw_MOV(p
, dst
, args
[0]);
1706 brw_MUL(p
, dst
, args
[0], args
[1]);
1709 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1712 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1715 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1719 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1722 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1725 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1728 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1731 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1734 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1737 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1740 unalias1(c
, dst
, args
[0], emit_sign
);
1743 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1746 /* The args[0] value can't be used here as it won't have
1747 * correctly encoded the full swizzle:
1749 emit_swz(c
, dst
, inst
);
1752 /* round toward zero */
1753 brw_RNDZ(p
, dst
, args
[0]);
1756 emit_xpd(p
, dst
, args
[0], args
[1]);
1759 assert(if_depth
< MAX_IF_DEPTH
);
1760 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1761 /* Note that brw_IF smashes the predicate_control field. */
1762 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1766 assert(if_depth
> 0);
1767 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1770 assert(if_depth
> 0);
1771 brw_ENDIF(p
, if_inst
[--if_depth
]);
1773 case OPCODE_BGNLOOP
:
1774 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1777 brw_set_predicate_control(p
, get_predicate(inst
));
1779 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1782 brw_set_predicate_control(p
, get_predicate(inst
));
1784 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1786 case OPCODE_ENDLOOP
:
1788 struct brw_instruction
*inst0
, *inst1
;
1793 if (intel
->gen
== 5)
1796 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1797 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1798 while (inst0
> loop_inst
[loop_depth
]) {
1800 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
1801 inst0
->bits3
.if_else
.jump_count
== 0) {
1802 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1803 inst0
->bits3
.if_else
.pop_count
= 0;
1805 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
1806 inst0
->bits3
.if_else
.jump_count
== 0) {
1807 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1808 inst0
->bits3
.if_else
.pop_count
= 0;
1814 brw_set_predicate_control(p
, get_predicate(inst
));
1815 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1816 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1819 brw_set_access_mode(p
, BRW_ALIGN_1
);
1820 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1821 brw_set_access_mode(p
, BRW_ALIGN_16
);
1822 brw_ADD(p
, get_addr_reg(stack_index
),
1823 get_addr_reg(stack_index
), brw_imm_d(4));
1824 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1825 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1828 brw_ADD(p
, get_addr_reg(stack_index
),
1829 get_addr_reg(stack_index
), brw_imm_d(-4));
1830 brw_set_access_mode(p
, BRW_ALIGN_1
);
1831 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1832 brw_set_access_mode(p
, BRW_ALIGN_16
);
1835 emit_vertex_write(c
);
1841 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1847 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1848 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1849 _mesa_opcode_string(inst
->Opcode
) :
1853 /* Set the predication update on the last instruction of the native
1854 * instruction sequence.
1856 * This would be problematic if it was set on a math instruction,
1857 * but that shouldn't be the case with the current GLSL compiler.
1859 if (inst
->CondUpdate
) {
1860 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
1862 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
1863 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
1866 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1867 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1868 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1869 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1872 /* Result color clamping.
1874 * When destination register is an output register and
1875 * it's primary/secondary front/back color, we have to clamp
1876 * the result to [0,1]. This is done by enabling the
1877 * saturation bit for the last instruction.
1879 * We don't use brw_set_saturate() as it modifies
1880 * p->current->header.saturate, which affects all the subsequent
1881 * instructions. Instead, we directly modify the header
1882 * of the last (already stored) instruction.
1884 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1885 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1886 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1887 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1888 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1889 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1896 brw_resolve_cals(p
);
1900 if (INTEL_DEBUG
& DEBUG_VS
) {
1903 printf("vs-native:\n");
1904 for (i
= 0; i
< p
->nr_insn
; i
++)
1905 brw_disasm(stderr
, &p
->store
[i
], intel
->gen
);