2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
/* NOTE(review): this file is a partial extraction; original lines are split
 * across stored lines and several are missing (numbering gaps). Code left
 * byte-identical; only comments added. */
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
/* Decides, per opcode and source-argument slot, whether the argument may be
 * encoded as an immediate float instead of a constant-buffer load. */
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
/* Per-opcode table: which (1-based) source slot may be an immediate.
 * NOTE(review): the initializer entries (original lines 47-65) are missing
 * from this extraction. */
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
/* MAD/LRP are lowered such that both arg 1 and arg 2 may be immediates. */
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
/* NOTE(review): '>' permits opcode == ARRAY_SIZE(opcode_array), which would
 * read one past the end of the table below — verify '>=' was not intended. */
74 if (opcode
> ARRAY_SIZE(opcode_array
))
/* Table stores slot+1 so that 0 means "no immediate allowed". */
77 return arg
== opcode_array
[opcode
] - 1;
/* Allocate the next free temporary GRF (vec8) from the compile context's
 * bump allocator, and grow the prog_data.total_grf high-water mark if the
 * new last_tmp exceeds it. Pair with release_tmp()/release_tmps().
 * NOTE(review): the 'return tmp;' and braces are missing from this
 * extraction (original lines 81, 86-88). */
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
/* Bump the allocator; keep total_grf at least as large as the peak use. */
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
/* Return a temporary to the bump allocator, but only if it is the most
 * recently allocated one (LIFO discipline) — otherwise it stays allocated
 * until release_tmps(). NOTE(review): the body statement under the 'if'
 * (original lines 93-95) is missing from this extraction; presumably it
 * decrements c->last_tmp — confirm against upstream. */
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
/* Release all temporaries at once by rewinding the bump allocator to the
 * first temporary slot. */
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
103 * Preallocate GRF register before code emit.
104 * Do things as simply as possible. Allocate and populate all regs
107 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
109 struct intel_context
*intel
= &c
->func
.brw
->intel
;
110 GLuint i
, reg
= 0, mrf
;
111 int attributes_in_vue
;
113 /* Determine whether to use a real constant buffer or use a block
114 * of GRF registers for constants. The later is faster but only
115 * works if everything fits in the GRF.
116 * XXX this heuristic/check may need some fine tuning...
118 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
119 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
120 c
->vp
->use_const_buffer
= GL_TRUE
;
122 c
->vp
->use_const_buffer
= GL_FALSE
;
124 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
126 /* r0 -- reserved as usual
128 c
->r0
= brw_vec8_grf(reg
, 0);
131 /* User clip planes from curbe:
133 if (c
->key
.nr_userclip
) {
134 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
135 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
138 /* Deal with curbe alignment:
140 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
143 /* Vertex program parameters from curbe:
145 if (c
->vp
->use_const_buffer
) {
146 int max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
149 /* We've got more constants than we can load with the push
150 * mechanism. This is often correlated with reladdr loads where
151 * we should probably be using a pull mechanism anyway to avoid
152 * excessive reading. However, the pull mechanism is slow in
153 * general. So, we try to allocate as many non-reladdr-loaded
154 * constants through the push buffer as we can before giving up.
156 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
158 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
160 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
163 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
164 if ((inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
165 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
166 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
167 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
168 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) ||
169 inst
->SrcReg
[arg
].RelAddr
)
172 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
173 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
178 for (i
= 0; i
< constant
; i
++) {
179 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2,
183 reg
+= (constant
+ 1) / 2;
184 c
->prog_data
.curb_read_length
= reg
- 1;
185 /* XXX 0 causes a bug elsewhere... */
186 c
->prog_data
.nr_params
= MAX2(constant
* 4, 4);
189 /* use a section of the GRF for constants */
190 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
191 for (i
= 0; i
< nr_params
; i
++) {
192 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
194 reg
+= (nr_params
+ 1) / 2;
195 c
->prog_data
.curb_read_length
= reg
- 1;
197 c
->prog_data
.nr_params
= nr_params
* 4;
200 /* Allocate input regs:
203 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
204 if (c
->prog_data
.inputs_read
& (1 << i
)) {
206 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
210 /* If there are no inputs, we'll still be reading one attribute's worth
211 * because it's required -- see urb_read_length setting.
213 if (c
->nr_inputs
== 0)
216 /* Allocate outputs. The non-position outputs go straight into message regs.
219 c
->first_output
= reg
;
220 c
->first_overflow_output
= 0;
224 else if (intel
->gen
== 5)
229 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
230 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
232 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
233 if (i
== VERT_RESULT_HPOS
) {
234 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
237 else if (i
== VERT_RESULT_PSIZ
) {
238 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
240 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
243 /* Two restrictions on our compute-to-MRF here. The
244 * message length for all SEND messages is restricted to
245 * [1,15], so we can't use mrf 15, as that means a length
248 * Additionally, URB writes are aligned to URB rows, so we
249 * need to put an even number of registers of URB data in
250 * each URB write so that the later write is aligned. A
251 * message length of 15 means 1 message header reg plus 14
254 * For attributes beyond the compute-to-MRF, we compute to
255 * GRFs and they will be written in the second URB_WRITE.
258 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
262 if (!c
->first_overflow_output
)
263 c
->first_overflow_output
= i
;
264 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
271 /* Allocate program temporaries:
273 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
274 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
278 /* Address reg(s). Don't try to use the internal address reg until
281 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
282 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
286 BRW_VERTICAL_STRIDE_8
,
288 BRW_HORIZONTAL_STRIDE_1
,
294 if (c
->vp
->use_const_buffer
) {
295 for (i
= 0; i
< 3; i
++) {
296 c
->current_const
[i
].index
= -1;
297 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
302 for (i
= 0; i
< 128; i
++) {
303 if (c
->output_regs
[i
].used_in_src
) {
304 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
309 if (c
->needs_stack
) {
310 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
314 /* Some opcodes need an internal temporary:
317 c
->last_tmp
= reg
; /* for allocation purposes */
319 /* Each input reg holds data from two vertices. The
320 * urb_read_length is the number of registers read from *each*
321 * vertex urb, so is half the amount:
323 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
324 /* Setting this field to 0 leads to undefined behavior according to the
325 * the VS_STATE docs. Our VUEs will always have at least one attribute
326 * sitting in them, even if it's padding.
328 if (c
->prog_data
.urb_read_length
== 0)
329 c
->prog_data
.urb_read_length
= 1;
331 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
332 * them to fit the biggest thing they need to.
334 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
336 /* See emit_vertex_write() for where the VUE's overhead on top of the
337 * attributes comes from.
340 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 7) / 8;
341 else if (intel
->gen
== 5)
342 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
344 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
346 c
->prog_data
.total_grf
= reg
;
348 if (INTEL_DEBUG
& DEBUG_VS
) {
349 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
350 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
351 printf("%s reg = %d\n", __FUNCTION__
, reg
);
357 * If an instruction uses a temp reg both as a src and the dest, we
358 * sometimes need to allocate an intermediate temporary.
/* One-operand unaliasing wrapper: if dst overlaps arg0, emit into a fresh
 * temporary (masked to dst's writemask) and copy back afterwards; otherwise
 * the callback writes dst directly. NOTE(review): extraction has dropped the
 * remaining parameters, the else branch and release_tmp (original lines
 * 361-366, 370, 372-378). */
360 static void unalias1( struct brw_vs_compile
*c
,
363 void (*func
)( struct brw_vs_compile
*,
/* Aliasing is detected by matching register file and register number. */
367 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
368 struct brw_compile
*p
= &c
->func
;
369 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
/* Copy the temporary result into the real destination. */
371 brw_MOV(p
, dst
, tmp
);
381 * Checks if 2-operand instruction needs an intermediate temporary.
/* Two-operand unaliasing wrapper: same scheme as unalias1(), but dst is
 * compared against both arg0 and arg1. NOTE(review): extraction is missing
 * parameters, braces and release_tmp (original lines 384-391, 398-400). */
383 static void unalias2( struct brw_vs_compile
*c
,
387 void (*func
)( struct brw_vs_compile
*,
/* dst aliases either source operand (same file and register number)? */
392 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
393 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
394 struct brw_compile
*p
= &c
->func
;
395 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
/* Emit into the temporary, then move the result into dst. */
396 func(c
, tmp
, arg0
, arg1
);
397 brw_MOV(p
, dst
, tmp
);
/* No aliasing: let the callback write dst directly. */
401 func(c
, dst
, arg0
, arg1
);
407 * Checks if 3-operand instruction needs an intermediate temporary.
/* Three-operand unaliasing wrapper: dst checked against arg0, arg1 and arg2.
 * NOTE(review): extraction is missing parameters, braces and release_tmp
 * (original lines 410-419, 427-429). */
409 static void unalias3( struct brw_vs_compile
*c
,
414 void (*func
)( struct brw_vs_compile
*,
/* dst aliases any of the three source operands? */
420 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
421 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
422 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
423 struct brw_compile
*p
= &c
->func
;
424 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
/* Emit into the temporary, then move the result into dst. */
425 func(c
, tmp
, arg0
, arg1
, arg2
);
426 brw_MOV(p
, dst
, tmp
);
/* No aliasing: callback writes dst directly. */
430 func(c
, dst
, arg0
, arg1
, arg2
);
/* Shared body for the SEQ/SNE/SLT/SLE/SGT/SGE "set-on-condition" opcodes:
 * dst = 0.0, then CMP sets the flag per 'cond', a predicated MOV writes 1.0
 * to the channels that passed, and the flag value is reset to all-ones.
 * NOTE(review): remaining parameters and predication setup lines (original
 * 435-439, 441) are missing from this extraction. */
434 static void emit_sop( struct brw_vs_compile
*c
,
440 struct brw_compile
*p
= &c
->func
;
/* Default all channels to 0.0. */
442 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
/* Compare arg0 <cond> arg1 into the flag register (null dst). */
443 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
/* Predicated: only channels where the compare passed get 1.0. */
444 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
/* Restore the flag register so later instructions are unpredicated. */
445 brw_set_predicate_control_flag_value(p
, 0xff);
/* SEQ: dst.chan = (arg0.chan == arg1.chan) ? 1.0 : 0.0 — via emit_sop(). */
448 static void emit_seq( struct brw_vs_compile
*c
,
451 struct brw_reg arg1
)
453 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
/* SNE: dst.chan = (arg0.chan != arg1.chan) ? 1.0 : 0.0 — via emit_sop(). */
456 static void emit_sne( struct brw_vs_compile
*c
,
459 struct brw_reg arg1
)
461 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
/* SLT: dst.chan = (arg0.chan < arg1.chan) ? 1.0 : 0.0 — via emit_sop(). */
463 static void emit_slt( struct brw_vs_compile
*c
,
466 struct brw_reg arg1
)
468 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
/* SLE: dst.chan = (arg0.chan <= arg1.chan) ? 1.0 : 0.0 — via emit_sop(). */
471 static void emit_sle( struct brw_vs_compile
*c
,
474 struct brw_reg arg1
)
476 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
/* SGT: dst.chan = (arg0.chan > arg1.chan) ? 1.0 : 0.0 — via emit_sop(). */
479 static void emit_sgt( struct brw_vs_compile
*c
,
482 struct brw_reg arg1
)
484 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
/* SGE: dst.chan = (arg0.chan >= arg1.chan) ? 1.0 : 0.0 — via emit_sop(). */
487 static void emit_sge( struct brw_vs_compile
*c
,
490 struct brw_reg arg1
)
492 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
/* CMP opcode: per channel, dst = (arg0 < 0) ? arg1 : arg2, implemented as a
 * flag-setting compare followed by a predicated SEL, then predication is
 * cleared. NOTE(review): remaining parameters (original lines 496-498) are
 * missing from this extraction. */
495 static void emit_cmp( struct brw_compile
*p
,
499 struct brw_reg arg2
)
/* Set the flag per channel where arg0 < 0. */
501 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
/* Predicated select: arg1 where the flag is set, arg2 elsewhere. */
502 brw_SEL(p
, dst
, arg1
, arg2
);
503 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* SSG opcode: dst = sign(arg0) per channel. Starts at 0.0, then two
 * compare+predicated-MOV pairs overwrite channels that are < 0 with -1.0
 * and channels that are > 0 with 1.0; channels equal to 0 keep 0.0. */
506 static void emit_sign(struct brw_vs_compile
*c
,
510 struct brw_compile
*p
= &c
->func
;
/* Default: zero everywhere. */
512 brw_MOV(p
, dst
, brw_imm_f(0));
/* arg0 < 0 → dst = -1.0 (predicated). */
514 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
515 brw_MOV(p
, dst
, brw_imm_f(-1.0));
516 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* arg0 > 0 → dst = 1.0 (predicated). */
518 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
519 brw_MOV(p
, dst
, brw_imm_f(1.0));
520 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* MAX opcode: per channel, dst = (arg0 >= arg1) ? arg0 : arg1, via a
 * flag-setting compare and a predicated SEL; predication cleared after.
 * NOTE(review): remaining parameters (original lines 524-525) are missing
 * from this extraction. */
523 static void emit_max( struct brw_compile
*p
,
526 struct brw_reg arg1
)
528 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
529 brw_SEL(p
, dst
, arg0
, arg1
);
530 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* MIN opcode: per channel, dst = (arg0 < arg1) ? arg0 : arg1 — mirror of
 * emit_max() with the L condition. NOTE(review): remaining parameters
 * (original lines 534-535) are missing from this extraction. */
533 static void emit_min( struct brw_compile
*p
,
536 struct brw_reg arg1
)
538 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
539 brw_SEL(p
, dst
, arg0
, arg1
);
540 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* Emit a one-source math-box operation (RSQ, INV, EXP, LOG, ...). On pre-gen6
 * hardware the math SEND result cannot be safely writemasked or sent to a
 * non-GRF destination, so in that case the result is produced in a temporary
 * and MOVed into dst afterwards. NOTE(review): this extraction is missing the
 * remaining parameters, the get_tmp call, and the brw_math()/SEND emission
 * itself (original lines 545-549, 556, 563-576) — only the surrounding
 * setup/copy-back survives. */
544 static void emit_math1( struct brw_vs_compile
*c
,
550 /* There are various odd behaviours with SEND on the simulator. In
551 * addition there are documented issues with the fact that the GEN4
552 * processor doesn't do dependency control properly on SEND
553 * results. So, on balance, this kludge to get around failures
554 * with writemasked math results looks like it might be necessary
555 * whether that turns out to be a simulator bug or not:
557 struct brw_compile
*p
= &c
->func
;
558 struct intel_context
*intel
= &p
->brw
->intel
;
559 struct brw_reg tmp
= dst
;
/* Pre-gen6 only: need a scratch GRF when dst is writemasked or not a GRF. */
560 GLboolean need_tmp
= (intel
->gen
< 6 &&
561 (dst
.dw1
.bits
.writemask
!= 0xf ||
562 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
570 BRW_MATH_SATURATE_NONE
,
573 BRW_MATH_DATA_SCALAR
,
/* Copy the math result from the temporary into the real destination. */
577 brw_MOV(p
, dst
, tmp
);
/* Emit a two-source math-box operation (e.g. POW): the second operand is
 * loaded into message register 3, then the math SEND is issued. Same
 * pre-gen6 writemask/non-GRF temporary workaround as emit_math1().
 * NOTE(review): this extraction is missing the remaining parameters and the
 * brw_math2()/SEND emission itself (original lines 584-589, 596-599,
 * 601-611) — only setup and copy-back survive. */
583 static void emit_math2( struct brw_vs_compile
*c
,
590 struct brw_compile
*p
= &c
->func
;
591 struct intel_context
*intel
= &p
->brw
->intel
;
592 struct brw_reg tmp
= dst
;
/* Pre-gen6 only: need a scratch GRF when dst is writemasked or not a GRF. */
593 GLboolean need_tmp
= (intel
->gen
< 6 &&
594 (dst
.dw1
.bits
.writemask
!= 0xf ||
595 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
/* Second operand travels in message register m3 for the math SEND. */
600 brw_MOV(p
, brw_message_reg(3), arg1
);
605 BRW_MATH_SATURATE_NONE
,
608 BRW_MATH_DATA_SCALAR
,
/* Copy the math result from the temporary into the real destination. */
612 brw_MOV(p
, dst
, tmp
);
618 static void emit_exp_noalias( struct brw_vs_compile
*c
,
620 struct brw_reg arg0
)
622 struct brw_compile
*p
= &c
->func
;
625 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
626 struct brw_reg tmp
= get_tmp(c
);
627 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
629 /* tmp_d = floor(arg0.x) */
630 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
632 /* result[0] = 2.0 ^ tmp */
634 /* Adjust exponent for floating point:
637 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
639 /* Install exponent and sign.
640 * Excess drops off the edge:
642 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
643 tmp_d
, brw_imm_d(23));
648 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
649 /* result[1] = arg0.x - floor(arg0.x) */
650 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
653 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
654 /* As with the LOG instruction, we might be better off just
655 * doing a taylor expansion here, seeing as we have to do all
658 * If mathbox partial precision is too low, consider also:
659 * result[3] = result[0] * EXP(result[1])
662 BRW_MATH_FUNCTION_EXP
,
663 brw_writemask(dst
, WRITEMASK_Z
),
664 brw_swizzle1(arg0
, 0),
665 BRW_MATH_PRECISION_FULL
);
668 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
669 /* result[3] = 1.0; */
670 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
675 static void emit_log_noalias( struct brw_vs_compile
*c
,
677 struct brw_reg arg0
)
679 struct brw_compile
*p
= &c
->func
;
680 struct brw_reg tmp
= dst
;
681 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
682 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
683 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
684 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
688 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
691 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
694 * These almost look likey they could be joined up, but not really
697 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
698 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
700 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
702 brw_writemask(tmp_ud
, WRITEMASK_X
),
703 brw_swizzle1(arg0_ud
, 0),
704 brw_imm_ud((1U<<31)-1));
707 brw_writemask(tmp_ud
, WRITEMASK_X
),
712 brw_writemask(tmp
, WRITEMASK_X
),
713 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
717 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
719 brw_writemask(tmp_ud
, WRITEMASK_Y
),
720 brw_swizzle1(arg0_ud
, 0),
721 brw_imm_ud((1<<23)-1));
724 brw_writemask(tmp_ud
, WRITEMASK_Y
),
726 brw_imm_ud(127<<23));
729 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
730 /* result[2] = result[0] + LOG2(result[1]); */
732 /* Why bother? The above is just a hint how to do this with a
733 * taylor series. Maybe we *should* use a taylor series as by
734 * the time all the above has been done it's almost certainly
735 * quicker than calling the mathbox, even with low precision.
738 * - result[0] + mathbox.LOG2(result[1])
739 * - mathbox.LOG2(arg0.x)
740 * - result[0] + inline_taylor_approx(result[1])
743 BRW_MATH_FUNCTION_LOG
,
744 brw_writemask(tmp
, WRITEMASK_Z
),
745 brw_swizzle1(tmp
, 1),
746 BRW_MATH_PRECISION_FULL
);
749 brw_writemask(tmp
, WRITEMASK_Z
),
750 brw_swizzle1(tmp
, 2),
751 brw_swizzle1(tmp
, 0));
754 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
755 /* result[3] = 1.0; */
756 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
760 brw_MOV(p
, dst
, tmp
);
766 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
/* DST opcode (distance vector): per enabled channel,
 * x = 1.0, y = arg0.y * arg1.y, z = arg0.z, w = arg1.w.
 * Caller must have unaliased dst from the sources (see unalias2/comment
 * above). NOTE(review): parameter list lines (original 769-772) are missing
 * from this extraction. */
768 static void emit_dst_noalias( struct brw_vs_compile
*c
,
773 struct brw_compile
*p
= &c
->func
;
775 /* There must be a better way to do this:
777 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
778 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
779 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
780 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
781 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
782 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
783 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
784 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
/* XPD opcode: cross product dst = t x u, computed as
 * t.yzx * u.zxy - t.zxy * u.yzx using a MUL into the accumulator followed
 * by MAC. NOTE(review): the dst/t/u parameter lines (original 789-792) are
 * missing from this extraction. */
788 static void emit_xpd( struct brw_compile
*p
,
793 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
794 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
798 static void emit_lit_noalias( struct brw_vs_compile
*c
,
800 struct brw_reg arg0
)
802 struct brw_compile
*p
= &c
->func
;
803 struct brw_instruction
*if_insn
;
804 struct brw_reg tmp
= dst
;
805 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
810 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
811 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
813 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
814 * to get all channels active inside the IF. In the clipping code
815 * we run with NoMask, so it's not an option and we can use
816 * BRW_EXECUTE_1 for all comparisions.
818 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
819 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
821 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
823 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
824 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
825 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
828 BRW_MATH_FUNCTION_POW
,
829 brw_writemask(dst
, WRITEMASK_Z
),
830 brw_swizzle1(tmp
, 2),
831 brw_swizzle1(arg0
, 3),
832 BRW_MATH_PRECISION_PARTIAL
);
835 brw_ENDIF(p
, if_insn
);
/* LRP opcode: dst = arg0 * arg1 + (1 - arg0) * arg2, computed as
 * dst = 1 - arg0; acc = dst * arg2; dst = acc + arg0 * arg1 (MAC).
 * Caller must have unaliased dst from the sources (unalias3).
 * NOTE(review): parameter list lines (original 841-845) are missing from
 * this extraction. */
840 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
846 struct brw_compile
*p
= &c
->func
;
848 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
/* MUL into the null reg feeds the accumulator for the following MAC. */
849 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
850 brw_MAC(p
, dst
, arg0
, arg1
);
853 /** 3 or 4-component vector normalization */
/* NRM3/NRM4: dst = arg0 / |arg0| — dot product (DP3 or DP4, selection logic
 * between original lines 863-865 is missing from this extraction), then
 * reciprocal square root via the math box, then scale. Temporary is taken
 * from get_tmp(); the matching release (original line 874) is also missing
 * here. */
854 static void emit_nrm( struct brw_vs_compile
*c
,
859 struct brw_compile
*p
= &c
->func
;
860 struct brw_reg tmp
= get_tmp(c
);
862 /* tmp = dot(arg0, arg0) */
864 brw_DP3(p
, tmp
, arg0
, arg0
);
866 brw_DP4(p
, tmp
, arg0
, arg0
);
868 /* tmp = 1 / sqrt(tmp) */
869 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
871 /* dst = arg0 * tmp */
872 brw_MUL(p
, dst
, arg0
, tmp
);
878 static struct brw_reg
879 get_constant(struct brw_vs_compile
*c
,
880 const struct prog_instruction
*inst
,
883 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
884 struct brw_compile
*p
= &c
->func
;
885 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
887 assert(argIndex
< 3);
889 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
890 /* Keep track of the last constant loaded in this slot, for reuse. */
891 c
->current_const
[argIndex
].index
= src
->Index
;
894 printf(" fetch const[%d] for arg %d into reg %d\n",
895 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
897 /* need to fetch the constant now */
899 const_reg
, /* writeback dest */
900 16 * src
->Index
, /* byte offset */
901 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
905 /* replicate lower four floats into upper half (to get XYZWXYZW) */
906 const_reg
= stride(const_reg
, 0, 4, 0);
912 static struct brw_reg
913 get_reladdr_constant(struct brw_vs_compile
*c
,
914 const struct prog_instruction
*inst
,
917 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
918 struct brw_compile
*p
= &c
->func
;
919 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
920 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
921 struct brw_reg byte_addr_reg
= get_tmp(c
);
923 assert(argIndex
< 3);
925 /* Can't reuse a reladdr constant load. */
926 c
->current_const
[argIndex
].index
= -1;
929 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
930 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
933 brw_MUL(p
, byte_addr_reg
, addrReg
, brw_imm_ud(16));
935 /* fetch the first vec4 */
936 brw_dp_READ_4_vs_relative(p
,
937 const_reg
, /* writeback dest */
938 byte_addr_reg
, /* address register */
939 16 * src
->Index
, /* byte offset */
940 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
948 /* TODO: relative addressing!
/* Map a (register file, index) pair from the Mesa program IR to the
 * pre-allocated hardware register recorded in c->regs[][] by
 * brw_vs_alloc_regs(). STATE_VAR/CONSTANT/UNIFORM all share the
 * PROGRAM_STATE_VAR slot table; undefined and write-only files return the
 * null register. NOTE(review): this extraction is missing the switch header,
 * some case labels (e.g. PROGRAM_INPUT/OUTPUT, original 952-954, 956-957)
 * and the default/assert lines (975-976, 978-980). */
950 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
951 gl_register_file file
,
955 case PROGRAM_TEMPORARY
:
/* nr == 0 would mean the slot was never allocated. */
958 assert(c
->regs
[file
][index
].nr
!= 0);
959 return c
->regs
[file
][index
];
960 case PROGRAM_STATE_VAR
:
961 case PROGRAM_CONSTANT
:
962 case PROGRAM_UNIFORM
:
/* Constants of all flavors live in the PROGRAM_STATE_VAR table. */
963 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
964 return c
->regs
[PROGRAM_STATE_VAR
][index
];
965 case PROGRAM_ADDRESS
:
967 return c
->regs
[file
][index
];
969 case PROGRAM_UNDEFINED
: /* undef values */
970 return brw_null_reg();
972 case PROGRAM_LOCAL_PARAM
:
973 case PROGRAM_ENV_PARAM
:
974 case PROGRAM_WRITE_ONLY
:
977 return brw_null_reg();
983 * Indirect addressing: get reg[[arg] + offset].
985 static struct brw_reg
deref( struct brw_vs_compile
*c
,
990 struct brw_compile
*p
= &c
->func
;
991 struct brw_reg tmp
= get_tmp(c
);
992 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
993 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
994 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
995 struct brw_reg indirect
= brw_vec4_indirect(0,0);
996 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
998 /* Set the vertical stride on the register access so that the first
999 * 4 components come from a0.0 and the second 4 from a0.1.
1001 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1004 brw_push_insn_state(p
);
1005 brw_set_access_mode(p
, BRW_ALIGN_1
);
1007 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1008 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1010 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1011 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1013 brw_MOV(p
, tmp
, indirect
);
1015 brw_pop_insn_state(p
);
1018 /* NOTE: tmp not released */
1023 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1024 const struct prog_instruction
*inst
,
1027 struct brw_compile
*p
= &c
->func
;
1029 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1030 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1031 struct brw_reg temp_base
= c
->regs
[inst
->DstReg
.File
][0];
1032 GLuint byte_offset
= temp_base
.nr
* 32 + temp_base
.subnr
;
1033 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1034 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1036 byte_offset
+= inst
->DstReg
.Index
* reg_size
;
1038 brw_push_insn_state(p
);
1039 brw_set_access_mode(p
, BRW_ALIGN_1
);
1041 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1042 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1043 brw_MOV(p
, indirect
, val
);
1045 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1046 brw_ADD(p
, brw_address_reg(0), acc
,
1047 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1048 brw_MOV(p
, indirect
, suboffset(val
, 4));
1050 brw_pop_insn_state(p
);
1054 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1055 * TODO: relative addressing!
1057 static struct brw_reg
1058 get_src_reg( struct brw_vs_compile
*c
,
1059 const struct prog_instruction
*inst
,
1062 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1063 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1064 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1066 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1067 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1069 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1073 return brw_imm_f(0.0f
);
1074 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1079 return brw_imm_f(-1.0F
);
1081 return brw_imm_f(1.0F
);
1082 } else if (src
->File
== PROGRAM_CONSTANT
) {
1083 const struct gl_program_parameter_list
*params
;
1087 switch (src
->Swizzle
) {
1102 if (component
>= 0) {
1103 params
= c
->vp
->program
.Base
.Parameters
;
1104 f
= params
->ParameterValues
[src
->Index
][component
];
1110 return brw_imm_f(f
);
1116 case PROGRAM_TEMPORARY
:
1118 case PROGRAM_OUTPUT
:
1120 return deref(c
, c
->regs
[file
][0], index
, 32);
1123 assert(c
->regs
[file
][index
].nr
!= 0);
1124 return c
->regs
[file
][index
];
1127 case PROGRAM_STATE_VAR
:
1128 case PROGRAM_CONSTANT
:
1129 case PROGRAM_UNIFORM
:
1130 case PROGRAM_ENV_PARAM
:
1131 case PROGRAM_LOCAL_PARAM
:
1132 if (c
->vp
->use_const_buffer
) {
1133 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1134 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1135 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1137 return get_reladdr_constant(c
, inst
, argIndex
);
1139 return get_constant(c
, inst
, argIndex
);
1142 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
, 16);
1145 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1146 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1148 case PROGRAM_ADDRESS
:
1150 return c
->regs
[file
][index
];
1152 case PROGRAM_UNDEFINED
:
1153 /* this is a normal case since we loop over all three src args */
1154 return brw_null_reg();
1156 case PROGRAM_WRITE_ONLY
:
1159 return brw_null_reg();
1164 * Return the brw reg for the given instruction's src argument.
1165 * Will return mangled results for SWZ op. The emit_swz() function
1166 * ignores this result and recalculates taking extended swizzles into
/* Fetch the hardware register for SrcReg[argIndex] via get_src_reg(), then
 * apply the program's 4x3-bit swizzle (as a hardware 2-bit swizzle — only
 * valid for non-extended swizzles, hence the SWZ caveat above) and the
 * per-source negate flag. NOTE(review): local 'reg' declaration and the
 * return statement (original lines 1171-1175, 1178, 1188-1189, 1193-1195)
 * are missing from this extraction. */
1169 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1170 const struct prog_instruction
*inst
,
1173 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
/* Unused argument slot: nothing to fetch. */
1176 if (src
->File
== PROGRAM_UNDEFINED
)
1177 return brw_null_reg();
1179 reg
= get_src_reg(c
, inst
, argIndex
);
1181 /* Convert 3-bit swizzle to 2-bit.
/* Immediates cannot carry a swizzle encoding. */
1183 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1184 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1185 GET_SWZ(src
->Swizzle
, 1),
1186 GET_SWZ(src
->Swizzle
, 2),
1187 GET_SWZ(src
->Swizzle
, 3));
1190 /* Note this is ok for non-swizzle instructions:
1192 reg
.negate
= src
->Negate
? 1 : 0;
1199 * Get brw register for the given program dest register.
/* Map a Mesa IR destination register to its hardware register and apply the
 * instruction's writemask. RelAddr destinations get a temporary (1x1-only
 * indirect addressing — see comment below); the result is moved to the real
 * destination after emit. NOTE(review): the switch header, the RelAddr
 * get_tmp branch, break statements and the final return (original lines
 * 1203-1206, 1213-1216, 1219-1220, 1224, 1228-1230, 1232-1233, 1236-1238)
 * are missing from this extraction. */
1201 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1202 struct prog_dst_register dst
)
1207 case PROGRAM_TEMPORARY
:
1208 case PROGRAM_OUTPUT
:
1209 /* register-indirect addressing is only 1x1, not VxH, for
1210 * destination regs. So, for RelAddr we'll return a temporary
1211 * for the dest and do a move of the result to the RelAddr
1212 * register after the instruction emit.
/* Direct destination: must have been allocated (nr != 0). */
1217 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1218 reg
= c
->regs
[dst
.File
][dst
.Index
];
1221 case PROGRAM_ADDRESS
:
/* Only a0.0 exists as an address destination. */
1222 assert(dst
.Index
== 0);
1223 reg
= c
->regs
[dst
.File
][dst
.Index
];
1225 case PROGRAM_UNDEFINED
:
1226 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1227 reg
= brw_null_reg();
1231 reg
= brw_null_reg();
/* Writemask lives in dw1 for register destinations, never immediates. */
1234 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1235 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1241 static void emit_swz( struct brw_vs_compile
*c
,
1243 const struct prog_instruction
*inst
)
1245 const GLuint argIndex
= 0;
1246 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1247 struct brw_compile
*p
= &c
->func
;
1248 GLuint zeros_mask
= 0;
1249 GLuint ones_mask
= 0;
1250 GLuint src_mask
= 0;
1252 GLboolean need_tmp
= (src
.Negate
&&
1253 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1254 struct brw_reg tmp
= dst
;
1260 for (i
= 0; i
< 4; i
++) {
1261 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1262 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1281 /* Do src first, in case dst aliases src:
1284 struct brw_reg arg0
;
1286 arg0
= get_src_reg(c
, inst
, argIndex
);
1288 arg0
= brw_swizzle(arg0
,
1289 src_swz
[0], src_swz
[1],
1290 src_swz
[2], src_swz
[3]);
1292 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1296 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1299 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1302 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1305 brw_MOV(p
, dst
, tmp
);
1306 release_tmp(c
, tmp
);
1312 * Post-vertex-program processing. Send the results to the URB.
/* NOTE(review): this region was line-mangled by extraction; the leading
 * "NNNN" tokens are original source line numbers, and several statements
 * (declarations of ndc/eot/i/mrf, braces, and the brw_urb_WRITE() call
 * headers) are missing from this view.  The comments added below annotate
 * only the statements that are visible.
 */
1314 static void emit_vertex_write( struct brw_vs_compile
*c
)
/* Local handles: code emitter, GL context wrappers, and the HPOS output. */
1316 struct brw_compile
*p
= &c
->func
;
1317 struct brw_context
*brw
= p
->brw
;
1318 struct intel_context
*intel
= &brw
->intel
;
1319 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
/* Vertex header is 2 message regs by default; the gen5 path below uses 6. */
1322 GLuint len_vertex_header
= 2;
/* Copy the edge-flag vertex attribute through to the VERT_RESULT_EDGE
 * output when the compile key requests it.  The call emitting the copy is
 * only partially visible here -- presumably a MOV; confirm against the
 * full source.
 */
1324 if (c
->key
.copy_edgeflag
) {
1326 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1327 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
/* Pre-Sandybridge parts need normalized device coordinates in the VUE
 * header: ndc.w = 1/pos.w, then ndc.xyz = pos.xyz * ndc.w.
 */
1330 if (intel
->gen
< 6) {
1331 /* Build ndc coords */
1333 /* ndc = 1.0 / pos.w */
1334 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1335 /* ndc.xyz = pos * ndc */
1336 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
/* Only build header1 when some header bit is actually needed: point size,
 * user clip flags, or the negative-RHW hardware workaround.
 */
1339 /* Update the header for point size, user clipping flags, and -ve rhw
1342 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1343 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
)
1345 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1348 brw_MOV(p
, header1
, brw_imm_ud(0));
1350 brw_set_access_mode(p
, BRW_ALIGN_16
);
/* Pack the point size into header1.w: scale by 2^11, then mask the result
 * to the 11-bit field at bit 8 (0x7ff << 8).
 */
1352 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1353 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1354 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1355 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
/* Set clip-flag bit i in header1.w when the vertex is behind user clip
 * plane i: DP4(pos, plane) with conditional-mod L sets the flag register,
 * which predicates the OR.  Predication is reset each iteration.
 */
1358 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1359 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1360 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1361 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1362 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* NOTE(review): the comparison that tests ndc.w (-ve rhw) and sets the
 * predicate consumed by the OR/MOV below is only partially visible here
 * (the vec8/brw_swizzle1 fragments).  Confirm against the full source.
 */
1365 /* i965 clipping workaround:
1366 * 1) Test for -ve rhw
1368 * set ndc = (0,0,0,0)
1371 * Later, clipping will detect ucp[6] and ensure the primitive is
1372 * clipped against all fixed planes.
1374 if (brw
->has_negative_rhw_bug
) {
1376 vec8(brw_null_reg()),
1378 brw_swizzle1(ndc
, 3),
1381 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1382 brw_MOV(p
, ndc
, brw_imm_f(0));
1383 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* Write the finished header dword block into message register 1. */
1386 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1387 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1388 brw_set_access_mode(p
, BRW_ALIGN_16
);
1390 release_tmp(c
, header1
);
/* No special header bits were needed: m1 = 0. */
1393 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1396 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1397 * of zeros followed by two sets of NDC coordinates:
1399 brw_set_access_mode(p
, BRW_ALIGN_1
);
1401 /* The VUE layout is documented in Volume 2a. */
1402 if (intel
->gen
>= 6) {
1403 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1404 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1405 * dword 4-7 (m2) is the 4D space position
1406 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1407 * enabled. We don't use it, so skip it.
1408 * m3 is the first vertex element data we fill, which is the vertex
1411 brw_MOV(p
, brw_message_reg(2), pos
);
1412 brw_MOV(p
, brw_message_reg(3), pos
);
1413 len_vertex_header
= 2;
1414 } else if (intel
->gen
== 5) {
1415 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1416 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1417 * dword 4-7 (m2) is the ndc position (set above)
1418 * dword 8-11 (m3) of the vertex header is the 4D space position
1419 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1420 * m6 is a pad so that the vertex element data is aligned
1421 * m7 is the first vertex data we fill, which is the vertex position.
1423 brw_MOV(p
, brw_message_reg(2), ndc
);
1424 brw_MOV(p
, brw_message_reg(3), pos
);
1425 brw_MOV(p
, brw_message_reg(7), pos
);
1426 len_vertex_header
= 6;
1428 /* There are 8 dwords in VUE header pre-Ironlake:
1429 * dword 0-3 (m1) is indices, point width, clip flags.
1430 * dword 4-7 (m2) is ndc position (set above)
1432 * dword 8-11 (m3) is the first vertex data, which we always have be the
1435 brw_MOV(p
, brw_message_reg(2), ndc
);
1436 brw_MOV(p
, brw_message_reg(3), pos
);
1437 len_vertex_header
= 2;
/* End-of-thread only if every output fit into this first URB write. */
1440 eot
= (c
->first_overflow_output
== 0);
/* First URB write (the brw_urb_WRITE() call header is not visible here):
 * interleaved two-vertex layout, message length capped at BRW_MAX_MRF-1.
 */
1443 brw_null_reg(), /* dest */
1444 0, /* starting mrf reg nr */
1448 MIN2(c
->nr_outputs
+ 1 + len_vertex_header
, (BRW_MAX_MRF
-1)), /* msg len */
1449 0, /* response len */
1451 eot
, /* writes complete */
1452 0, /* urb destination offset */
1453 BRW_URB_SWIZZLE_INTERLEAVE
);
1455 if (c
->first_overflow_output
> 0) {
1456 /* Not all of the vertex outputs/results fit into the MRF.
1457 * Move the overflowed attributes from the GRF to the MRF and
1458 * issue another brw_urb_WRITE().
1461 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1462 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1463 /* move from GRF to MRF */
1464 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
/* Second URB write for the overflowed attributes (call header and msg len
 * not visible here); this one always terminates the thread ("writes
 * complete" = 1) and targets URB destination offset 14/2.
 */
1470 brw_null_reg(), /* dest */
1471 0, /* starting mrf reg nr */
1476 0, /* response len */
1478 1, /* writes complete */
1479 14 / 2, /* urb destination offset */
1480 BRW_URB_SWIZZLE_INTERLEAVE
);
/* Return whether the most recently emitted instruction already left `val`
 * in the accumulator, so the caller (the OPCODE_MAD handling later in this
 * file) can skip the explicit MOV of args[2] into the accumulator before a
 * MAC.  Presumably MOV/MAC/MUL update the accumulator as a side effect on
 * this hardware -- confirm against the PRM.
 * NOTE(review): this region is line-mangled; the return type line, the
 * early returns' values, and the switch/return tail are missing from this
 * view.
 */
1485 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1487 struct brw_compile
*p
= &c
->func
;
/* Pointer to the last emitted instruction; computed before the nr_insn
 * guard below, but only dereferenced after it.
 */
1488 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
/* Nothing emitted yet: the accumulator cannot contain val. */
1490 if (p
->nr_insn
== 0)
/* Indirectly addressed operands can't be matched against a direct
 * destination encoding.
 */
1493 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
/* Only consider MOV/MAC/MUL, and require the previous instruction's
 * destination to match val exactly: align16 access mode, same execution
 * width, register file, type, address mode, register number, subregister,
 * and a full xyzw writemask.
 */
1496 switch (prev_insn
->header
.opcode
) {
1497 case BRW_OPCODE_MOV
:
1498 case BRW_OPCODE_MAC
:
1499 case BRW_OPCODE_MUL
:
1500 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1501 prev_insn
->header
.execution_size
== val
.width
&&
1502 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1503 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1504 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1505 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1506 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1507 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
/* Translate a Mesa IR condition (inst->DstReg.CondMask / CondSwizzle)
 * into a gen4 predicate-control encoding for subsequently emitted
 * predicated instructions.
 * NOTE(review): this region is line-mangled; the return type line and the
 * SWIZZLE_X/Y/Z/W case labels of the switch are missing from this view --
 * each REPLICATE_* return below corresponds to one replicated channel.
 */
1517 get_predicate(const struct prog_instruction
*inst
)
/* COND_TR means "always true": no predication needed. */
1519 if (inst
->DstReg
.CondMask
== COND_TR
)
1520 return BRW_PREDICATE_NONE
;
1522 /* All of GLSL only produces predicates for COND_NE and one channel per
1523 * vector. Fail badly if someone starts doing something else, as it might
1524 * mean infinite looping or something.
1526 * We'd like to support all the condition codes, but our hardware doesn't
1527 * quite match the Mesa IR, which is modeled after the NV extensions. For
1528 * those, the instruction may update the condition codes or not, then any
1529 * later instruction may use one of those condition codes. For gen4, the
1530 * instruction may update the flags register based on one of the condition
1531 * codes output by the instruction, and then further instructions may
1532 * predicate on that. We can probably support this, but it won't
1533 * necessarily be easy.
1535 assert(inst
->DstReg
.CondMask
== COND_NE
);
/* Pick which flag channel to replicate based on the condition swizzle. */
1537 switch (inst
->DstReg
.CondSwizzle
) {
1539 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1541 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1543 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1545 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
/* Unknown swizzle: report and fall back to normal predication.
 * NOTE(review): the message logs CondMask, but the switch above dispatched
 * on CondSwizzle -- possibly the wrong field is being reported; verify
 * before changing, since the string is runtime output.
 */
1547 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1548 inst
->DstReg
.CondMask
);
1549 return BRW_PREDICATE_NORMAL
;
1553 /* Emit the vertex program instructions here.
1555 void brw_vs_emit(struct brw_vs_compile
*c
)
1557 #define MAX_IF_DEPTH 32
1558 #define MAX_LOOP_DEPTH 32
1559 struct brw_compile
*p
= &c
->func
;
1560 struct brw_context
*brw
= p
->brw
;
1561 struct intel_context
*intel
= &brw
->intel
;
1562 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1563 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1564 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1565 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1566 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1570 if (INTEL_DEBUG
& DEBUG_VS
) {
1571 printf("vs-mesa:\n");
1572 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1577 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1578 brw_set_access_mode(p
, BRW_ALIGN_16
);
1579 if_depth_in_loop
[loop_depth
] = 0;
1581 for (insn
= 0; insn
< nr_insns
; insn
++) {
1583 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1585 /* Message registers can't be read, so copy the output into GRF
1586 * register if they are used in source registers
1588 for (i
= 0; i
< 3; i
++) {
1589 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1590 GLuint index
= src
->Index
;
1591 GLuint file
= src
->File
;
1592 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1593 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1596 switch (inst
->Opcode
) {
1599 c
->needs_stack
= GL_TRUE
;
1606 /* Static register allocation
1608 brw_vs_alloc_regs(c
);
1611 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1613 for (insn
= 0; insn
< nr_insns
; insn
++) {
1615 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1616 struct brw_reg args
[3], dst
;
1618 struct brw_instruction
*temp
;
1621 printf("%d: ", insn
);
1622 _mesa_print_instruction(inst
);
1625 /* Get argument regs. SWZ is special and does this itself.
1627 if (inst
->Opcode
!= OPCODE_SWZ
)
1628 for (i
= 0; i
< 3; i
++) {
1629 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1632 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1633 args
[i
] = c
->output_regs
[index
].reg
;
1635 args
[i
] = get_arg(c
, inst
, i
);
1638 /* Get dest regs. Note that it is possible for a reg to be both
1639 * dst and arg, given the static allocation of registers. So
1640 * care needs to be taken emitting multi-operation instructions.
1642 index
= inst
->DstReg
.Index
;
1643 file
= inst
->DstReg
.File
;
1644 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1645 dst
= c
->output_regs
[index
].reg
;
1647 dst
= get_dst(c
, inst
->DstReg
);
1649 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1650 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1651 inst
->SaturateMode
);
1654 switch (inst
->Opcode
) {
1656 brw_MOV(p
, dst
, brw_abs(args
[0]));
1659 brw_ADD(p
, dst
, args
[0], args
[1]);
1662 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1665 brw_DP2(p
, dst
, args
[0], args
[1]);
1668 brw_DP3(p
, dst
, args
[0], args
[1]);
1671 brw_DP4(p
, dst
, args
[0], args
[1]);
1674 brw_DPH(p
, dst
, args
[0], args
[1]);
1677 emit_nrm(c
, dst
, args
[0], 3);
1680 emit_nrm(c
, dst
, args
[0], 4);
1683 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1686 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1689 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1692 brw_RNDD(p
, dst
, args
[0]);
1695 brw_RNDD(p
, dst
, args
[0]);
1698 brw_FRC(p
, dst
, args
[0]);
1701 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1704 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1707 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1710 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1713 if (!accumulator_contains(c
, args
[2]))
1714 brw_MOV(p
, brw_acc_reg(), args
[2]);
1715 brw_MAC(p
, dst
, args
[0], args
[1]);
1718 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
1721 emit_max(p
, dst
, args
[0], args
[1]);
1724 emit_min(p
, dst
, args
[0], args
[1]);
1727 brw_MOV(p
, dst
, args
[0]);
1730 brw_MUL(p
, dst
, args
[0], args
[1]);
1733 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1736 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1739 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1743 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1746 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1749 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1752 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1755 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1758 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1761 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1764 unalias1(c
, dst
, args
[0], emit_sign
);
1767 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1770 /* The args[0] value can't be used here as it won't have
1771 * correctly encoded the full swizzle:
1773 emit_swz(c
, dst
, inst
);
1776 /* round toward zero */
1777 brw_RNDZ(p
, dst
, args
[0]);
1780 emit_xpd(p
, dst
, args
[0], args
[1]);
1783 assert(if_depth
< MAX_IF_DEPTH
);
1784 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1785 /* Note that brw_IF smashes the predicate_control field. */
1786 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1787 if_depth_in_loop
[loop_depth
]++;
1791 assert(if_depth
> 0);
1792 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1795 assert(if_depth
> 0);
1796 brw_ENDIF(p
, if_inst
[--if_depth
]);
1797 if_depth_in_loop
[loop_depth
]--;
1799 case OPCODE_BGNLOOP
:
1800 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1801 if_depth_in_loop
[loop_depth
] = 0;
1804 brw_set_predicate_control(p
, get_predicate(inst
));
1805 temp
= brw_BREAK(p
);
1806 temp
->bits3
.if_else
.pop_count
= if_depth_in_loop
[loop_depth
];
1807 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1810 brw_set_predicate_control(p
, get_predicate(inst
));
1812 temp
->bits3
.if_else
.pop_count
= if_depth_in_loop
[loop_depth
];
1813 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1815 case OPCODE_ENDLOOP
:
1817 struct brw_instruction
*inst0
, *inst1
;
1822 if (intel
->gen
== 5)
1825 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1826 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1827 while (inst0
> loop_inst
[loop_depth
]) {
1829 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
1830 inst0
->bits3
.if_else
.jump_count
== 0) {
1831 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1833 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
1834 inst0
->bits3
.if_else
.jump_count
== 0) {
1835 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1841 brw_set_predicate_control(p
, get_predicate(inst
));
1842 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1843 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1846 brw_set_access_mode(p
, BRW_ALIGN_1
);
1847 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1848 brw_set_access_mode(p
, BRW_ALIGN_16
);
1849 brw_ADD(p
, get_addr_reg(stack_index
),
1850 get_addr_reg(stack_index
), brw_imm_d(4));
1851 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1852 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1855 brw_ADD(p
, get_addr_reg(stack_index
),
1856 get_addr_reg(stack_index
), brw_imm_d(-4));
1857 brw_set_access_mode(p
, BRW_ALIGN_1
);
1858 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1859 brw_set_access_mode(p
, BRW_ALIGN_16
);
1862 emit_vertex_write(c
);
1868 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1874 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1875 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1876 _mesa_opcode_string(inst
->Opcode
) :
1880 /* Set the predication update on the last instruction of the native
1881 * instruction sequence.
1883 * This would be problematic if it was set on a math instruction,
1884 * but that shouldn't be the case with the current GLSL compiler.
1886 if (inst
->CondUpdate
) {
1887 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
1889 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
1890 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
1893 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1894 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1895 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1896 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1899 /* Result color clamping.
1901 * When destination register is an output register and
1902 * it's primary/secondary front/back color, we have to clamp
1903 * the result to [0,1]. This is done by enabling the
1904 * saturation bit for the last instruction.
1906 * We don't use brw_set_saturate() as it modifies
1907 * p->current->header.saturate, which affects all the subsequent
1908 * instructions. Instead, we directly modify the header
1909 * of the last (already stored) instruction.
1911 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1912 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1913 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1914 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1915 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1916 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1920 if (inst
->DstReg
.RelAddr
&& inst
->DstReg
.File
== PROGRAM_TEMPORARY
) {
1921 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
1922 * compute-to-mrf and the fact that we are allocating
1923 * registers for only the used PROGRAM_OUTPUTs.
1925 move_to_reladdr_dst(c
, inst
, dst
);
1931 brw_resolve_cals(p
);
1935 if (INTEL_DEBUG
& DEBUG_VS
) {
1938 printf("vs-native:\n");
1939 for (i
= 0; i
< p
->nr_insn
; i
++)
1940 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);