2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
65 /* These opcodes get broken down in a way that allow two
66 * args to be immediates.
68 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
69 if (arg
== 1 || arg
== 2)
73 if (opcode
> ARRAY_SIZE(opcode_array
))
76 return arg
== opcode_array
[opcode
] - 1;
79 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
81 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
83 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
84 c
->prog_data
.total_grf
= c
->last_tmp
;
89 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
91 if (tmp
.nr
== c
->last_tmp
-1)
95 static void release_tmps( struct brw_vs_compile
*c
)
97 c
->last_tmp
= c
->first_tmp
;
102 * Preallocate GRF register before code emit.
103 * Do things as simply as possible. Allocate and populate all regs
106 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
108 struct intel_context
*intel
= &c
->func
.brw
->intel
;
109 GLuint i
, reg
= 0, mrf
;
110 int attributes_in_vue
;
112 /* Determine whether to use a real constant buffer or use a block
113 * of GRF registers for constants. The latter is faster but only
114 * works if everything fits in the GRF.
115 * XXX this heuristic/check may need some fine tuning...
117 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
118 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
119 c
->vp
->use_const_buffer
= GL_TRUE
;
121 c
->vp
->use_const_buffer
= GL_FALSE
;
123 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
125 /* r0 -- reserved as usual
127 c
->r0
= brw_vec8_grf(reg
, 0);
130 /* User clip planes from curbe:
132 if (c
->key
.nr_userclip
) {
133 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
134 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
137 /* Deal with curbe alignment:
139 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
142 /* Vertex program parameters from curbe:
144 if (c
->vp
->use_const_buffer
) {
145 int max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
148 /* We've got more constants than we can load with the push
149 * mechanism. This is often correlated with reladdr loads where
150 * we should probably be using a pull mechanism anyway to avoid
151 * excessive reading. However, the pull mechanism is slow in
152 * general. So, we try to allocate as many non-reladdr-loaded
153 * constants through the push buffer as we can before giving up.
155 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
157 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
159 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
162 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
163 if ((inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
164 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
165 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
166 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
167 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) ||
168 inst
->SrcReg
[arg
].RelAddr
)
171 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
172 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
177 for (i
= 0; i
< constant
; i
++) {
178 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2,
182 reg
+= (constant
+ 1) / 2;
183 c
->prog_data
.curb_read_length
= reg
- 1;
184 /* XXX 0 causes a bug elsewhere... */
185 c
->prog_data
.nr_params
= MAX2(constant
* 4, 4);
188 /* use a section of the GRF for constants */
189 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
190 for (i
= 0; i
< nr_params
; i
++) {
191 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
193 reg
+= (nr_params
+ 1) / 2;
194 c
->prog_data
.curb_read_length
= reg
- 1;
196 c
->prog_data
.nr_params
= nr_params
* 4;
199 /* Allocate input regs:
202 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
203 if (c
->prog_data
.inputs_read
& (1 << i
)) {
205 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
209 /* If there are no inputs, we'll still be reading one attribute's worth
210 * because it's required -- see urb_read_length setting.
212 if (c
->nr_inputs
== 0)
215 /* Allocate outputs. The non-position outputs go straight into message regs.
218 c
->first_output
= reg
;
219 c
->first_overflow_output
= 0;
223 else if (intel
->gen
== 5)
228 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
229 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
231 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
232 if (i
== VERT_RESULT_HPOS
) {
233 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
236 else if (i
== VERT_RESULT_PSIZ
) {
237 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
239 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
242 /* Two restrictions on our compute-to-MRF here. The
243 * message length for all SEND messages is restricted to
244 * [1,15], so we can't use mrf 15, as that means a length
247 * Additionally, URB writes are aligned to URB rows, so we
248 * need to put an even number of registers of URB data in
249 * each URB write so that the later write is aligned. A
250 * message length of 15 means 1 message header reg plus 14
253 * For attributes beyond the compute-to-MRF, we compute to
254 * GRFs and they will be written in the second URB_WRITE.
257 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
261 if (!c
->first_overflow_output
)
262 c
->first_overflow_output
= i
;
263 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
270 /* Allocate program temporaries:
272 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
273 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
277 /* Address reg(s). Don't try to use the internal address reg until
280 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
281 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
285 BRW_VERTICAL_STRIDE_8
,
287 BRW_HORIZONTAL_STRIDE_1
,
293 if (c
->vp
->use_const_buffer
) {
294 for (i
= 0; i
< 3; i
++) {
295 c
->current_const
[i
].index
= -1;
296 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
301 for (i
= 0; i
< 128; i
++) {
302 if (c
->output_regs
[i
].used_in_src
) {
303 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
308 if (c
->needs_stack
) {
309 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
313 /* Some opcodes need an internal temporary:
316 c
->last_tmp
= reg
; /* for allocation purposes */
318 /* Each input reg holds data from two vertices. The
319 * urb_read_length is the number of registers read from *each*
320 * vertex urb, so is half the amount:
322 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
323 /* Setting this field to 0 leads to undefined behavior according to the
324 * VS_STATE docs. Our VUEs will always have at least one attribute
325 * sitting in them, even if it's padding.
327 if (c
->prog_data
.urb_read_length
== 0)
328 c
->prog_data
.urb_read_length
= 1;
330 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
331 * them to fit the biggest thing they need to.
333 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
335 /* See emit_vertex_write() for where the VUE's overhead on top of the
336 * attributes comes from.
339 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 7) / 8;
340 else if (intel
->gen
== 5)
341 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
343 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
345 c
->prog_data
.total_grf
= reg
;
347 if (INTEL_DEBUG
& DEBUG_VS
) {
348 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
349 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
350 printf("%s reg = %d\n", __FUNCTION__
, reg
);
356 * If an instruction uses a temp reg both as a src and the dest, we
357 * sometimes need to allocate an intermediate temporary.
359 static void unalias1( struct brw_vs_compile
*c
,
362 void (*func
)( struct brw_vs_compile
*,
366 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
367 struct brw_compile
*p
= &c
->func
;
368 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
370 brw_MOV(p
, dst
, tmp
);
380 * Checks if 2-operand instruction needs an intermediate temporary.
382 static void unalias2( struct brw_vs_compile
*c
,
386 void (*func
)( struct brw_vs_compile
*,
391 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
392 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
393 struct brw_compile
*p
= &c
->func
;
394 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
395 func(c
, tmp
, arg0
, arg1
);
396 brw_MOV(p
, dst
, tmp
);
400 func(c
, dst
, arg0
, arg1
);
406 * Checks if 3-operand instruction needs an intermediate temporary.
408 static void unalias3( struct brw_vs_compile
*c
,
413 void (*func
)( struct brw_vs_compile
*,
419 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
420 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
421 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
422 struct brw_compile
*p
= &c
->func
;
423 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
424 func(c
, tmp
, arg0
, arg1
, arg2
);
425 brw_MOV(p
, dst
, tmp
);
429 func(c
, dst
, arg0
, arg1
, arg2
);
433 static void emit_sop( struct brw_vs_compile
*c
,
439 struct brw_compile
*p
= &c
->func
;
441 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
442 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
443 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
444 brw_set_predicate_control_flag_value(p
, 0xff);
447 static void emit_seq( struct brw_vs_compile
*c
,
450 struct brw_reg arg1
)
452 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
455 static void emit_sne( struct brw_vs_compile
*c
,
458 struct brw_reg arg1
)
460 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
462 static void emit_slt( struct brw_vs_compile
*c
,
465 struct brw_reg arg1
)
467 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
470 static void emit_sle( struct brw_vs_compile
*c
,
473 struct brw_reg arg1
)
475 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
478 static void emit_sgt( struct brw_vs_compile
*c
,
481 struct brw_reg arg1
)
483 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
486 static void emit_sge( struct brw_vs_compile
*c
,
489 struct brw_reg arg1
)
491 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
494 static void emit_cmp( struct brw_compile
*p
,
498 struct brw_reg arg2
)
500 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
501 brw_SEL(p
, dst
, arg1
, arg2
);
502 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
505 static void emit_sign(struct brw_vs_compile
*c
,
509 struct brw_compile
*p
= &c
->func
;
511 brw_MOV(p
, dst
, brw_imm_f(0));
513 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
514 brw_MOV(p
, dst
, brw_imm_f(-1.0));
515 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
517 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
518 brw_MOV(p
, dst
, brw_imm_f(1.0));
519 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
522 static void emit_max( struct brw_compile
*p
,
525 struct brw_reg arg1
)
527 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
528 brw_SEL(p
, dst
, arg0
, arg1
);
529 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
532 static void emit_min( struct brw_compile
*p
,
535 struct brw_reg arg1
)
537 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
538 brw_SEL(p
, dst
, arg0
, arg1
);
539 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
543 static void emit_math1( struct brw_vs_compile
*c
,
549 /* There are various odd behaviours with SEND on the simulator. In
550 * addition there are documented issues with the fact that the GEN4
551 * processor doesn't do dependency control properly on SEND
552 * results. So, on balance, this kludge to get around failures
553 * with writemasked math results looks like it might be necessary
554 * whether that turns out to be a simulator bug or not:
556 struct brw_compile
*p
= &c
->func
;
557 struct intel_context
*intel
= &p
->brw
->intel
;
558 struct brw_reg tmp
= dst
;
559 GLboolean need_tmp
= (intel
->gen
< 6 &&
560 (dst
.dw1
.bits
.writemask
!= 0xf ||
561 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
569 BRW_MATH_SATURATE_NONE
,
572 BRW_MATH_DATA_SCALAR
,
576 brw_MOV(p
, dst
, tmp
);
582 static void emit_math2( struct brw_vs_compile
*c
,
589 struct brw_compile
*p
= &c
->func
;
590 struct intel_context
*intel
= &p
->brw
->intel
;
591 struct brw_reg tmp
= dst
;
592 GLboolean need_tmp
= (intel
->gen
< 6 &&
593 (dst
.dw1
.bits
.writemask
!= 0xf ||
594 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
599 brw_MOV(p
, brw_message_reg(3), arg1
);
604 BRW_MATH_SATURATE_NONE
,
607 BRW_MATH_DATA_SCALAR
,
611 brw_MOV(p
, dst
, tmp
);
617 static void emit_exp_noalias( struct brw_vs_compile
*c
,
619 struct brw_reg arg0
)
621 struct brw_compile
*p
= &c
->func
;
624 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
625 struct brw_reg tmp
= get_tmp(c
);
626 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
628 /* tmp_d = floor(arg0.x) */
629 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
631 /* result[0] = 2.0 ^ tmp */
633 /* Adjust exponent for floating point:
636 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
638 /* Install exponent and sign.
639 * Excess drops off the edge:
641 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
642 tmp_d
, brw_imm_d(23));
647 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
648 /* result[1] = arg0.x - floor(arg0.x) */
649 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
652 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
653 /* As with the LOG instruction, we might be better off just
654 * doing a taylor expansion here, seeing as we have to do all
657 * If mathbox partial precision is too low, consider also:
658 * result[3] = result[0] * EXP(result[1])
661 BRW_MATH_FUNCTION_EXP
,
662 brw_writemask(dst
, WRITEMASK_Z
),
663 brw_swizzle1(arg0
, 0),
664 BRW_MATH_PRECISION_FULL
);
667 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
668 /* result[3] = 1.0; */
669 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
674 static void emit_log_noalias( struct brw_vs_compile
*c
,
676 struct brw_reg arg0
)
678 struct brw_compile
*p
= &c
->func
;
679 struct brw_reg tmp
= dst
;
680 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
681 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
682 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
683 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
687 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
690 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
693 * These almost look like they could be joined up, but not really
696 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
697 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
699 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
701 brw_writemask(tmp_ud
, WRITEMASK_X
),
702 brw_swizzle1(arg0_ud
, 0),
703 brw_imm_ud((1U<<31)-1));
706 brw_writemask(tmp_ud
, WRITEMASK_X
),
711 brw_writemask(tmp
, WRITEMASK_X
),
712 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
716 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
718 brw_writemask(tmp_ud
, WRITEMASK_Y
),
719 brw_swizzle1(arg0_ud
, 0),
720 brw_imm_ud((1<<23)-1));
723 brw_writemask(tmp_ud
, WRITEMASK_Y
),
725 brw_imm_ud(127<<23));
728 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
729 /* result[2] = result[0] + LOG2(result[1]); */
731 /* Why bother? The above is just a hint how to do this with a
732 * taylor series. Maybe we *should* use a taylor series as by
733 * the time all the above has been done it's almost certainly
734 * quicker than calling the mathbox, even with low precision.
737 * - result[0] + mathbox.LOG2(result[1])
738 * - mathbox.LOG2(arg0.x)
739 * - result[0] + inline_taylor_approx(result[1])
742 BRW_MATH_FUNCTION_LOG
,
743 brw_writemask(tmp
, WRITEMASK_Z
),
744 brw_swizzle1(tmp
, 1),
745 BRW_MATH_PRECISION_FULL
);
748 brw_writemask(tmp
, WRITEMASK_Z
),
749 brw_swizzle1(tmp
, 2),
750 brw_swizzle1(tmp
, 0));
753 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
754 /* result[3] = 1.0; */
755 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
759 brw_MOV(p
, dst
, tmp
);
765 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
767 static void emit_dst_noalias( struct brw_vs_compile
*c
,
772 struct brw_compile
*p
= &c
->func
;
774 /* There must be a better way to do this:
776 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
777 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
778 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
779 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
780 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
781 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
782 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
783 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
787 static void emit_xpd( struct brw_compile
*p
,
792 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
793 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
797 static void emit_lit_noalias( struct brw_vs_compile
*c
,
799 struct brw_reg arg0
)
801 struct brw_compile
*p
= &c
->func
;
802 struct brw_instruction
*if_insn
;
803 struct brw_reg tmp
= dst
;
804 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
809 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
810 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
812 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
813 * to get all channels active inside the IF. In the clipping code
814 * we run with NoMask, so it's not an option and we can use
815 * BRW_EXECUTE_1 for all comparisons.
817 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
818 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
820 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
822 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
823 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
824 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
827 BRW_MATH_FUNCTION_POW
,
828 brw_writemask(dst
, WRITEMASK_Z
),
829 brw_swizzle1(tmp
, 2),
830 brw_swizzle1(arg0
, 3),
831 BRW_MATH_PRECISION_PARTIAL
);
834 brw_ENDIF(p
, if_insn
);
839 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
845 struct brw_compile
*p
= &c
->func
;
847 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
848 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
849 brw_MAC(p
, dst
, arg0
, arg1
);
852 /** 3 or 4-component vector normalization */
853 static void emit_nrm( struct brw_vs_compile
*c
,
858 struct brw_compile
*p
= &c
->func
;
859 struct brw_reg tmp
= get_tmp(c
);
861 /* tmp = dot(arg0, arg0) */
863 brw_DP3(p
, tmp
, arg0
, arg0
);
865 brw_DP4(p
, tmp
, arg0
, arg0
);
867 /* tmp = 1 / sqrt(tmp) */
868 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
870 /* dst = arg0 * tmp */
871 brw_MUL(p
, dst
, arg0
, tmp
);
877 static struct brw_reg
878 get_constant(struct brw_vs_compile
*c
,
879 const struct prog_instruction
*inst
,
882 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
883 struct brw_compile
*p
= &c
->func
;
884 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
886 assert(argIndex
< 3);
888 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
889 /* Keep track of the last constant loaded in this slot, for reuse. */
890 c
->current_const
[argIndex
].index
= src
->Index
;
893 printf(" fetch const[%d] for arg %d into reg %d\n",
894 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
896 /* need to fetch the constant now */
898 const_reg
, /* writeback dest */
899 16 * src
->Index
, /* byte offset */
900 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
904 /* replicate lower four floats into upper half (to get XYZWXYZW) */
905 const_reg
= stride(const_reg
, 0, 4, 0);
911 static struct brw_reg
912 get_reladdr_constant(struct brw_vs_compile
*c
,
913 const struct prog_instruction
*inst
,
916 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
917 struct brw_compile
*p
= &c
->func
;
918 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
919 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
920 struct brw_reg byte_addr_reg
= get_tmp(c
);
922 assert(argIndex
< 3);
924 /* Can't reuse a reladdr constant load. */
925 c
->current_const
[argIndex
].index
= -1;
928 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
929 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
932 brw_MUL(p
, byte_addr_reg
, addrReg
, brw_imm_ud(16));
934 /* fetch the first vec4 */
935 brw_dp_READ_4_vs_relative(p
,
936 const_reg
, /* writeback dest */
937 byte_addr_reg
, /* address register */
938 16 * src
->Index
, /* byte offset */
939 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
947 /* TODO: relative addressing!
949 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
950 gl_register_file file
,
954 case PROGRAM_TEMPORARY
:
957 assert(c
->regs
[file
][index
].nr
!= 0);
958 return c
->regs
[file
][index
];
959 case PROGRAM_STATE_VAR
:
960 case PROGRAM_CONSTANT
:
961 case PROGRAM_UNIFORM
:
962 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
963 return c
->regs
[PROGRAM_STATE_VAR
][index
];
964 case PROGRAM_ADDRESS
:
966 return c
->regs
[file
][index
];
968 case PROGRAM_UNDEFINED
: /* undef values */
969 return brw_null_reg();
971 case PROGRAM_LOCAL_PARAM
:
972 case PROGRAM_ENV_PARAM
:
973 case PROGRAM_WRITE_ONLY
:
976 return brw_null_reg();
982 * Indirect addressing: get reg[[arg] + offset].
984 static struct brw_reg
deref( struct brw_vs_compile
*c
,
989 struct brw_compile
*p
= &c
->func
;
990 struct brw_reg tmp
= get_tmp(c
);
991 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
992 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
993 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
994 struct brw_reg indirect
= brw_vec4_indirect(0,0);
995 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
997 /* Set the vertical stride on the register access so that the first
998 * 4 components come from a0.0 and the second 4 from a0.1.
1000 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1003 brw_push_insn_state(p
);
1004 brw_set_access_mode(p
, BRW_ALIGN_1
);
1006 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1007 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1009 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1010 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1012 brw_MOV(p
, tmp
, indirect
);
1014 brw_pop_insn_state(p
);
1017 /* NOTE: tmp not released */
1022 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1023 const struct prog_instruction
*inst
,
1026 struct brw_compile
*p
= &c
->func
;
1028 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1029 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1030 struct brw_reg temp_base
= c
->regs
[inst
->DstReg
.File
][0];
1031 GLuint byte_offset
= temp_base
.nr
* 32 + temp_base
.subnr
;
1032 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1033 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1035 byte_offset
+= inst
->DstReg
.Index
* reg_size
;
1037 brw_push_insn_state(p
);
1038 brw_set_access_mode(p
, BRW_ALIGN_1
);
1040 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1041 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1042 brw_MOV(p
, indirect
, val
);
1044 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1045 brw_ADD(p
, brw_address_reg(0), acc
,
1046 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1047 brw_MOV(p
, indirect
, suboffset(val
, 4));
1049 brw_pop_insn_state(p
);
1053 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1054 * TODO: relative addressing!
1056 static struct brw_reg
1057 get_src_reg( struct brw_vs_compile
*c
,
1058 const struct prog_instruction
*inst
,
1061 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1062 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1063 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1065 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1066 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1068 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1072 return brw_imm_f(0.0f
);
1073 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1078 return brw_imm_f(-1.0F
);
1080 return brw_imm_f(1.0F
);
1081 } else if (src
->File
== PROGRAM_CONSTANT
) {
1082 const struct gl_program_parameter_list
*params
;
1086 switch (src
->Swizzle
) {
1101 if (component
>= 0) {
1102 params
= c
->vp
->program
.Base
.Parameters
;
1103 f
= params
->ParameterValues
[src
->Index
][component
];
1109 return brw_imm_f(f
);
1115 case PROGRAM_TEMPORARY
:
1117 case PROGRAM_OUTPUT
:
1119 return deref(c
, c
->regs
[file
][0], index
, 32);
1122 assert(c
->regs
[file
][index
].nr
!= 0);
1123 return c
->regs
[file
][index
];
1126 case PROGRAM_STATE_VAR
:
1127 case PROGRAM_CONSTANT
:
1128 case PROGRAM_UNIFORM
:
1129 case PROGRAM_ENV_PARAM
:
1130 case PROGRAM_LOCAL_PARAM
:
1131 if (c
->vp
->use_const_buffer
) {
1132 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1133 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1134 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1136 return get_reladdr_constant(c
, inst
, argIndex
);
1138 return get_constant(c
, inst
, argIndex
);
1141 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
, 16);
1144 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1145 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1147 case PROGRAM_ADDRESS
:
1149 return c
->regs
[file
][index
];
1151 case PROGRAM_UNDEFINED
:
1152 /* this is a normal case since we loop over all three src args */
1153 return brw_null_reg();
1155 case PROGRAM_WRITE_ONLY
:
1158 return brw_null_reg();
1163 * Return the brw reg for the given instruction's src argument.
1164 * Will return mangled results for SWZ op. The emit_swz() function
1165 * ignores this result and recalculates taking extended swizzles into
1168 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1169 const struct prog_instruction
*inst
,
1172 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1175 if (src
->File
== PROGRAM_UNDEFINED
)
1176 return brw_null_reg();
1178 reg
= get_src_reg(c
, inst
, argIndex
);
1180 /* Convert 3-bit swizzle to 2-bit.
1182 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1183 GET_SWZ(src
->Swizzle
, 1),
1184 GET_SWZ(src
->Swizzle
, 2),
1185 GET_SWZ(src
->Swizzle
, 3));
1187 /* Note this is ok for non-swizzle instructions:
1189 reg
.negate
= src
->Negate
? 1 : 0;
1196 * Get brw register for the given program dest register.
1198 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1199 struct prog_dst_register dst
)
1204 case PROGRAM_TEMPORARY
:
1205 case PROGRAM_OUTPUT
:
1206 /* register-indirect addressing is only 1x1, not VxH, for
1207 * destination regs. So, for RelAddr we'll return a temporary
1208 * for the dest and do a move of the result to the RelAddr
1209 * register after the instruction emit.
1214 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1215 reg
= c
->regs
[dst
.File
][dst
.Index
];
1218 case PROGRAM_ADDRESS
:
1219 assert(dst
.Index
== 0);
1220 reg
= c
->regs
[dst
.File
][dst
.Index
];
1222 case PROGRAM_UNDEFINED
:
1223 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1224 reg
= brw_null_reg();
1228 reg
= brw_null_reg();
1231 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1237 static void emit_swz( struct brw_vs_compile
*c
,
1239 const struct prog_instruction
*inst
)
1241 const GLuint argIndex
= 0;
1242 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1243 struct brw_compile
*p
= &c
->func
;
1244 GLuint zeros_mask
= 0;
1245 GLuint ones_mask
= 0;
1246 GLuint src_mask
= 0;
1248 GLboolean need_tmp
= (src
.Negate
&&
1249 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1250 struct brw_reg tmp
= dst
;
1256 for (i
= 0; i
< 4; i
++) {
1257 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1258 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1277 /* Do src first, in case dst aliases src:
1280 struct brw_reg arg0
;
1282 arg0
= get_src_reg(c
, inst
, argIndex
);
1284 arg0
= brw_swizzle(arg0
,
1285 src_swz
[0], src_swz
[1],
1286 src_swz
[2], src_swz
[3]);
1288 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1292 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1295 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1298 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1301 brw_MOV(p
, dst
, tmp
);
1302 release_tmp(c
, tmp
);
1308 * Post-vertex-program processing. Send the results to the URB.
1310 static void emit_vertex_write( struct brw_vs_compile
*c
)
1312 struct brw_compile
*p
= &c
->func
;
1313 struct brw_context
*brw
= p
->brw
;
1314 struct intel_context
*intel
= &brw
->intel
;
1315 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1318 GLuint len_vertex_header
= 2;
1320 if (c
->key
.copy_edgeflag
) {
1322 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1323 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1326 if (intel
->gen
< 6) {
1327 /* Build ndc coords */
1329 /* ndc = 1.0 / pos.w */
1330 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1331 /* ndc.xyz = pos * ndc */
1332 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1335 /* Update the header for point size, user clipping flags, and -ve rhw
1338 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1339 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
)
1341 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1344 brw_MOV(p
, header1
, brw_imm_ud(0));
1346 brw_set_access_mode(p
, BRW_ALIGN_16
);
1348 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1349 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1350 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1351 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1354 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1355 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1356 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1357 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1358 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1361 /* i965 clipping workaround:
1362 * 1) Test for -ve rhw
1364 * set ndc = (0,0,0,0)
1367 * Later, clipping will detect ucp[6] and ensure the primitive is
1368 * clipped against all fixed planes.
1370 if (brw
->has_negative_rhw_bug
) {
1372 vec8(brw_null_reg()),
1374 brw_swizzle1(ndc
, 3),
1377 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1378 brw_MOV(p
, ndc
, brw_imm_f(0));
1379 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1382 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1383 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1384 brw_set_access_mode(p
, BRW_ALIGN_16
);
1386 release_tmp(c
, header1
);
1389 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1392 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1393 * of zeros followed by two sets of NDC coordinates:
1395 brw_set_access_mode(p
, BRW_ALIGN_1
);
1397 /* The VUE layout is documented in Volume 2a. */
1398 if (intel
->gen
>= 6) {
1399 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1400 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1401 * dword 4-7 (m2) is the 4D space position
1402 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1403 * enabled. We don't use it, so skip it.
1404 * m3 is the first vertex element data we fill, which is the vertex
1407 brw_MOV(p
, brw_message_reg(2), pos
);
1408 brw_MOV(p
, brw_message_reg(3), pos
);
1409 len_vertex_header
= 2;
1410 } else if (intel
->gen
== 5) {
1411 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1412 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1413 * dword 4-7 (m2) is the ndc position (set above)
1414 * dword 8-11 (m3) of the vertex header is the 4D space position
1415 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1416 * m6 is a pad so that the vertex element data is aligned
1417 * m7 is the first vertex data we fill, which is the vertex position.
1419 brw_MOV(p
, brw_message_reg(2), ndc
);
1420 brw_MOV(p
, brw_message_reg(3), pos
);
1421 brw_MOV(p
, brw_message_reg(7), pos
);
1422 len_vertex_header
= 6;
1424 /* There are 8 dwords in VUE header pre-Ironlake:
1425 * dword 0-3 (m1) is indices, point width, clip flags.
1426 * dword 4-7 (m2) is ndc position (set above)
1428 * dword 8-11 (m3) is the first vertex data, which we always have be the
1431 brw_MOV(p
, brw_message_reg(2), ndc
);
1432 brw_MOV(p
, brw_message_reg(3), pos
);
1433 len_vertex_header
= 2;
1436 eot
= (c
->first_overflow_output
== 0);
1439 brw_null_reg(), /* dest */
1440 0, /* starting mrf reg nr */
1444 MIN2(c
->nr_outputs
+ 1 + len_vertex_header
, (BRW_MAX_MRF
-1)), /* msg len */
1445 0, /* response len */
1447 eot
, /* writes complete */
1448 0, /* urb destination offset */
1449 BRW_URB_SWIZZLE_INTERLEAVE
);
1451 if (c
->first_overflow_output
> 0) {
1452 /* Not all of the vertex outputs/results fit into the MRF.
1453 * Move the overflowed attributes from the GRF to the MRF and
1454 * issue another brw_urb_WRITE().
1457 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1458 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1459 /* move from GRF to MRF */
1460 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1466 brw_null_reg(), /* dest */
1467 0, /* starting mrf reg nr */
1472 0, /* response len */
1474 1, /* writes complete */
1475 14 / 2, /* urb destination offset */
1476 BRW_URB_SWIZZLE_INTERLEAVE
);
1481 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1483 struct brw_compile
*p
= &c
->func
;
1484 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1486 if (p
->nr_insn
== 0)
1489 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1492 switch (prev_insn
->header
.opcode
) {
1493 case BRW_OPCODE_MOV
:
1494 case BRW_OPCODE_MAC
:
1495 case BRW_OPCODE_MUL
:
1496 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1497 prev_insn
->header
.execution_size
== val
.width
&&
1498 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1499 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1500 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1501 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1502 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1503 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1513 get_predicate(const struct prog_instruction
*inst
)
1515 if (inst
->DstReg
.CondMask
== COND_TR
)
1516 return BRW_PREDICATE_NONE
;
1518 /* All of GLSL only produces predicates for COND_NE and one channel per
1519 * vector. Fail badly if someone starts doing something else, as it might
1520 * mean infinite looping or something.
1522 * We'd like to support all the condition codes, but our hardware doesn't
1523 * quite match the Mesa IR, which is modeled after the NV extensions. For
1524 * those, the instruction may update the condition codes or not, then any
1525 * later instruction may use one of those condition codes. For gen4, the
1526 * instruction may update the flags register based on one of the condition
1527 * codes output by the instruction, and then further instructions may
1528 * predicate on that. We can probably support this, but it won't
1529 * necessarily be easy.
1531 assert(inst
->DstReg
.CondMask
== COND_NE
);
1533 switch (inst
->DstReg
.CondSwizzle
) {
1535 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1537 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1539 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1541 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1543 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1544 inst
->DstReg
.CondMask
);
1545 return BRW_PREDICATE_NORMAL
;
1549 /* Emit the vertex program instructions here.
1551 void brw_vs_emit(struct brw_vs_compile
*c
)
1553 #define MAX_IF_DEPTH 32
1554 #define MAX_LOOP_DEPTH 32
1555 struct brw_compile
*p
= &c
->func
;
1556 struct brw_context
*brw
= p
->brw
;
1557 struct intel_context
*intel
= &brw
->intel
;
1558 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1559 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1560 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1561 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1565 if (INTEL_DEBUG
& DEBUG_VS
) {
1566 printf("vs-mesa:\n");
1567 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1572 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1573 brw_set_access_mode(p
, BRW_ALIGN_16
);
1575 for (insn
= 0; insn
< nr_insns
; insn
++) {
1577 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1579 /* Message registers can't be read, so copy the output into GRF
1580 * register if they are used in source registers
1582 for (i
= 0; i
< 3; i
++) {
1583 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1584 GLuint index
= src
->Index
;
1585 GLuint file
= src
->File
;
1586 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1587 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1590 switch (inst
->Opcode
) {
1593 c
->needs_stack
= GL_TRUE
;
1600 /* Static register allocation
1602 brw_vs_alloc_regs(c
);
1605 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1607 for (insn
= 0; insn
< nr_insns
; insn
++) {
1609 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1610 struct brw_reg args
[3], dst
;
1614 printf("%d: ", insn
);
1615 _mesa_print_instruction(inst
);
1618 /* Get argument regs. SWZ is special and does this itself.
1620 if (inst
->Opcode
!= OPCODE_SWZ
)
1621 for (i
= 0; i
< 3; i
++) {
1622 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1625 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1626 args
[i
] = c
->output_regs
[index
].reg
;
1628 args
[i
] = get_arg(c
, inst
, i
);
1631 /* Get dest regs. Note that it is possible for a reg to be both
1632 * dst and arg, given the static allocation of registers. So
1633 * care needs to be taken emitting multi-operation instructions.
1635 index
= inst
->DstReg
.Index
;
1636 file
= inst
->DstReg
.File
;
1637 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1638 dst
= c
->output_regs
[index
].reg
;
1640 dst
= get_dst(c
, inst
->DstReg
);
1642 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1643 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1644 inst
->SaturateMode
);
1647 switch (inst
->Opcode
) {
1649 brw_MOV(p
, dst
, brw_abs(args
[0]));
1652 brw_ADD(p
, dst
, args
[0], args
[1]);
1655 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1658 brw_DP3(p
, dst
, args
[0], args
[1]);
1661 brw_DP4(p
, dst
, args
[0], args
[1]);
1664 brw_DPH(p
, dst
, args
[0], args
[1]);
1667 emit_nrm(c
, dst
, args
[0], 3);
1670 emit_nrm(c
, dst
, args
[0], 4);
1673 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1676 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1679 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1682 brw_RNDD(p
, dst
, args
[0]);
1685 brw_RNDD(p
, dst
, args
[0]);
1688 brw_FRC(p
, dst
, args
[0]);
1691 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1694 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1697 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1700 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1703 if (!accumulator_contains(c
, args
[2]))
1704 brw_MOV(p
, brw_acc_reg(), args
[2]);
1705 brw_MAC(p
, dst
, args
[0], args
[1]);
1708 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
1711 emit_max(p
, dst
, args
[0], args
[1]);
1714 emit_min(p
, dst
, args
[0], args
[1]);
1717 brw_MOV(p
, dst
, args
[0]);
1720 brw_MUL(p
, dst
, args
[0], args
[1]);
1723 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1726 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1729 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1733 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1736 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1739 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1742 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1745 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1748 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1751 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1754 unalias1(c
, dst
, args
[0], emit_sign
);
1757 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1760 /* The args[0] value can't be used here as it won't have
1761 * correctly encoded the full swizzle:
1763 emit_swz(c
, dst
, inst
);
1766 /* round toward zero */
1767 brw_RNDZ(p
, dst
, args
[0]);
1770 emit_xpd(p
, dst
, args
[0], args
[1]);
1773 assert(if_depth
< MAX_IF_DEPTH
);
1774 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1775 /* Note that brw_IF smashes the predicate_control field. */
1776 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1780 assert(if_depth
> 0);
1781 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1784 assert(if_depth
> 0);
1785 brw_ENDIF(p
, if_inst
[--if_depth
]);
1787 case OPCODE_BGNLOOP
:
1788 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1791 brw_set_predicate_control(p
, get_predicate(inst
));
1793 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1796 brw_set_predicate_control(p
, get_predicate(inst
));
1798 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1800 case OPCODE_ENDLOOP
:
1802 struct brw_instruction
*inst0
, *inst1
;
1807 if (intel
->gen
== 5)
1810 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1811 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1812 while (inst0
> loop_inst
[loop_depth
]) {
1814 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
1815 inst0
->bits3
.if_else
.jump_count
== 0) {
1816 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1817 inst0
->bits3
.if_else
.pop_count
= 0;
1819 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
1820 inst0
->bits3
.if_else
.jump_count
== 0) {
1821 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1822 inst0
->bits3
.if_else
.pop_count
= 0;
1828 brw_set_predicate_control(p
, get_predicate(inst
));
1829 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1830 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1833 brw_set_access_mode(p
, BRW_ALIGN_1
);
1834 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1835 brw_set_access_mode(p
, BRW_ALIGN_16
);
1836 brw_ADD(p
, get_addr_reg(stack_index
),
1837 get_addr_reg(stack_index
), brw_imm_d(4));
1838 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1839 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1842 brw_ADD(p
, get_addr_reg(stack_index
),
1843 get_addr_reg(stack_index
), brw_imm_d(-4));
1844 brw_set_access_mode(p
, BRW_ALIGN_1
);
1845 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1846 brw_set_access_mode(p
, BRW_ALIGN_16
);
1849 emit_vertex_write(c
);
1855 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1861 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1862 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1863 _mesa_opcode_string(inst
->Opcode
) :
1867 /* Set the predication update on the last instruction of the native
1868 * instruction sequence.
1870 * This would be problematic if it was set on a math instruction,
1871 * but that shouldn't be the case with the current GLSL compiler.
1873 if (inst
->CondUpdate
) {
1874 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
1876 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
1877 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
1880 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1881 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1882 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1883 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1886 /* Result color clamping.
1888 * When destination register is an output register and
1889 * it's primary/secondary front/back color, we have to clamp
1890 * the result to [0,1]. This is done by enabling the
1891 * saturation bit for the last instruction.
1893 * We don't use brw_set_saturate() as it modifies
1894 * p->current->header.saturate, which affects all the subsequent
1895 * instructions. Instead, we directly modify the header
1896 * of the last (already stored) instruction.
1898 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1899 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1900 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1901 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1902 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1903 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1907 if (inst
->DstReg
.RelAddr
&& inst
->DstReg
.File
== PROGRAM_TEMPORARY
) {
1908 /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the
1909 * compute-to-mrf and the fact that we are allocating
1910 * registers for only the used PROGRAM_OUTPUTs.
1912 move_to_reladdr_dst(c
, inst
, dst
);
1918 brw_resolve_cals(p
);
1922 if (INTEL_DEBUG
& DEBUG_VS
) {
1925 printf("vs-native:\n");
1926 for (i
= 0; i
< p
->nr_insn
; i
++)
1927 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);