2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return whether source argument `arg` of `opcode` may be emitted as an
41 * immediate float operand instead of a PROGRAM_CONSTANT value fetched through push/pull.
// Decides whether source argument `arg` of `opcode` may be an immediate.
// The final statement compares `arg` with (opcode_array[opcode] - 1), so each
// table entry appears to hold a 1-based argument position.
// NOTE(review): this extract is incomplete -- the return type, the
// opcode_array initializers and the statements guarded by the two inner
// `if`s are not visible here.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
// NOTE(review): off-by-one. This guard uses `>`, so an opcode equal to
// ARRAY_SIZE(opcode_array) passes it and the opcode_array[opcode] read
// below is one element past the end of the table. The comparison
// should presumably be `>=`.
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
124 clear_current_const(struct brw_vs_compile
*c
)
128 if (c
->vp
->use_const_buffer
) {
129 for (i
= 0; i
< 3; i
++) {
130 c
->current_const
[i
].index
= -1;
136 * Preallocate GRF register before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
140 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
142 struct intel_context
*intel
= &c
->func
.brw
->intel
;
143 GLuint i
, reg
= 0, mrf
;
144 int attributes_in_vue
;
145 int first_reladdr_output
;
147 /* Determine whether to use a real constant buffer or use a block
148 * of GRF registers for constants. The later is faster but only
149 * works if everything fits in the GRF.
150 * XXX this heuristic/check may need some fine tuning...
152 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
153 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
154 c
->vp
->use_const_buffer
= GL_TRUE
;
156 c
->vp
->use_const_buffer
= GL_FALSE
;
158 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
160 /* r0 -- reserved as usual
162 c
->r0
= brw_vec8_grf(reg
, 0);
165 /* User clip planes from curbe:
167 if (c
->key
.nr_userclip
) {
168 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
169 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
172 /* Deal with curbe alignment:
174 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
177 /* Vertex program parameters from curbe:
179 if (c
->vp
->use_const_buffer
) {
180 int max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
183 /* We've got more constants than we can load with the push
184 * mechanism. This is often correlated with reladdr loads where
185 * we should probably be using a pull mechanism anyway to avoid
186 * excessive reading. However, the pull mechanism is slow in
187 * general. So, we try to allocate as many non-reladdr-loaded
188 * constants through the push buffer as we can before giving up.
190 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
192 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
194 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
197 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
198 if ((inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
199 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
200 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
201 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
202 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) ||
203 inst
->SrcReg
[arg
].RelAddr
)
206 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
207 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
212 for (i
= 0; i
< constant
; i
++) {
213 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2,
217 reg
+= (constant
+ 1) / 2;
218 c
->prog_data
.curb_read_length
= reg
- 1;
219 /* XXX 0 causes a bug elsewhere... */
220 c
->prog_data
.nr_params
= MAX2(constant
* 4, 4);
223 /* use a section of the GRF for constants */
224 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
225 for (i
= 0; i
< nr_params
; i
++) {
226 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
228 reg
+= (nr_params
+ 1) / 2;
229 c
->prog_data
.curb_read_length
= reg
- 1;
231 c
->prog_data
.nr_params
= nr_params
* 4;
234 /* Allocate input regs:
237 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
238 if (c
->prog_data
.inputs_read
& (1 << i
)) {
240 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
244 /* If there are no inputs, we'll still be reading one attribute's worth
245 * because it's required -- see urb_read_length setting.
247 if (c
->nr_inputs
== 0)
250 /* Allocate outputs. The non-position outputs go straight into message regs.
253 c
->first_output
= reg
;
254 c
->first_overflow_output
= 0;
257 mrf
= 3; /* no more pos store in attribute */
258 else if (intel
->gen
== 5)
263 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
264 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
265 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
267 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
268 if (i
== VERT_RESULT_HPOS
) {
269 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
272 else if (i
== VERT_RESULT_PSIZ
) {
273 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
275 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
278 /* Two restrictions on our compute-to-MRF here. The
279 * message length for all SEND messages is restricted to
280 * [1,15], so we can't use mrf 15, as that means a length
283 * Additionally, URB writes are aligned to URB rows, so we
284 * need to put an even number of registers of URB data in
285 * each URB write so that the later write is aligned. A
286 * message length of 15 means 1 message header reg plus 14
289 * For attributes beyond the compute-to-MRF, we compute to
290 * GRFs and they will be written in the second URB_WRITE.
292 if (first_reladdr_output
> i
&& mrf
< 15) {
293 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
297 if (mrf
>= 15 && !c
->first_overflow_output
)
298 c
->first_overflow_output
= i
;
299 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
307 /* Allocate program temporaries:
309 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
310 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
314 /* Address reg(s). Don't try to use the internal address reg until
317 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
318 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
322 BRW_VERTICAL_STRIDE_8
,
324 BRW_HORIZONTAL_STRIDE_1
,
330 if (c
->vp
->use_const_buffer
) {
331 for (i
= 0; i
< 3; i
++) {
332 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
335 clear_current_const(c
);
338 for (i
= 0; i
< 128; i
++) {
339 if (c
->output_regs
[i
].used_in_src
) {
340 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
345 if (c
->needs_stack
) {
346 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
350 /* Some opcodes need an internal temporary:
353 c
->last_tmp
= reg
; /* for allocation purposes */
355 /* Each input reg holds data from two vertices. The
356 * urb_read_length is the number of registers read from *each*
357 * vertex urb, so is half the amount:
359 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
360 /* Setting this field to 0 leads to undefined behavior according to the
361 * the VS_STATE docs. Our VUEs will always have at least one attribute
362 * sitting in them, even if it's padding.
364 if (c
->prog_data
.urb_read_length
== 0)
365 c
->prog_data
.urb_read_length
= 1;
367 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
368 * them to fit the biggest thing they need to.
370 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
372 /* See emit_vertex_write() for where the VUE's overhead on top of the
373 * attributes comes from.
376 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 7) / 8;
377 else if (intel
->gen
== 5)
378 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
380 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
382 c
->prog_data
.total_grf
= reg
;
384 if (INTEL_DEBUG
& DEBUG_VS
) {
385 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
386 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
387 printf("%s reg = %d\n", __FUNCTION__
, reg
);
393 * If an instruction uses a temp reg both as a src and the dest, we
394 * sometimes need to allocate an intermediate temporary.
396 static void unalias1( struct brw_vs_compile
*c
,
399 void (*func
)( struct brw_vs_compile
*,
403 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
404 struct brw_compile
*p
= &c
->func
;
405 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
407 brw_MOV(p
, dst
, tmp
);
417 * Checks whether a 2-operand instruction needs an intermediate temporary.
419 static void unalias2( struct brw_vs_compile
*c
,
423 void (*func
)( struct brw_vs_compile
*,
428 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
429 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
430 struct brw_compile
*p
= &c
->func
;
431 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
432 func(c
, tmp
, arg0
, arg1
);
433 brw_MOV(p
, dst
, tmp
);
437 func(c
, dst
, arg0
, arg1
);
443 * Checks whether a 3-operand instruction needs an intermediate temporary.
445 static void unalias3( struct brw_vs_compile
*c
,
450 void (*func
)( struct brw_vs_compile
*,
456 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
457 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
458 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
459 struct brw_compile
*p
= &c
->func
;
460 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
461 func(c
, tmp
, arg0
, arg1
, arg2
);
462 brw_MOV(p
, dst
, tmp
);
466 func(c
, dst
, arg0
, arg1
, arg2
);
470 static void emit_sop( struct brw_vs_compile
*c
,
476 struct brw_compile
*p
= &c
->func
;
478 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
479 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
480 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
481 brw_set_predicate_control_flag_value(p
, 0xff);
484 static void emit_seq( struct brw_vs_compile
*c
,
487 struct brw_reg arg1
)
489 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
492 static void emit_sne( struct brw_vs_compile
*c
,
495 struct brw_reg arg1
)
497 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
499 static void emit_slt( struct brw_vs_compile
*c
,
502 struct brw_reg arg1
)
504 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
507 static void emit_sle( struct brw_vs_compile
*c
,
510 struct brw_reg arg1
)
512 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
515 static void emit_sgt( struct brw_vs_compile
*c
,
518 struct brw_reg arg1
)
520 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
523 static void emit_sge( struct brw_vs_compile
*c
,
526 struct brw_reg arg1
)
528 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
531 static void emit_cmp( struct brw_compile
*p
,
535 struct brw_reg arg2
)
537 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
538 brw_SEL(p
, dst
, arg1
, arg2
);
539 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
542 static void emit_sign(struct brw_vs_compile
*c
,
546 struct brw_compile
*p
= &c
->func
;
548 brw_MOV(p
, dst
, brw_imm_f(0));
550 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
551 brw_MOV(p
, dst
, brw_imm_f(-1.0));
552 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
554 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
555 brw_MOV(p
, dst
, brw_imm_f(1.0));
556 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
559 static void emit_max( struct brw_compile
*p
,
562 struct brw_reg arg1
)
564 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
565 brw_SEL(p
, dst
, arg0
, arg1
);
566 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
569 static void emit_min( struct brw_compile
*p
,
572 struct brw_reg arg1
)
574 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
575 brw_SEL(p
, dst
, arg0
, arg1
);
576 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
580 static void emit_math1( struct brw_vs_compile
*c
,
586 /* There are various odd behaviours with SEND on the simulator. In
587 * addition there are documented issues with the fact that the GEN4
588 * processor doesn't do dependency control properly on SEND
589 * results. So, on balance, this kludge to get around failures
590 * with writemasked math results looks like it might be necessary
591 * whether that turns out to be a simulator bug or not:
593 struct brw_compile
*p
= &c
->func
;
594 struct intel_context
*intel
= &p
->brw
->intel
;
595 struct brw_reg tmp
= dst
;
596 GLboolean need_tmp
= GL_FALSE
;
598 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
)
601 if (intel
->gen
< 6 && dst
.dw1
.bits
.writemask
!= 0xf)
610 BRW_MATH_SATURATE_NONE
,
613 BRW_MATH_DATA_SCALAR
,
617 brw_MOV(p
, dst
, tmp
);
623 static void emit_math2( struct brw_vs_compile
*c
,
630 struct brw_compile
*p
= &c
->func
;
631 struct intel_context
*intel
= &p
->brw
->intel
;
632 struct brw_reg tmp
= dst
;
633 GLboolean need_tmp
= GL_FALSE
;
635 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
)
638 if (intel
->gen
< 6 && dst
.dw1
.bits
.writemask
!= 0xf)
644 brw_MOV(p
, brw_message_reg(3), arg1
);
649 BRW_MATH_SATURATE_NONE
,
652 BRW_MATH_DATA_SCALAR
,
656 brw_MOV(p
, dst
, tmp
);
662 static void emit_exp_noalias( struct brw_vs_compile
*c
,
664 struct brw_reg arg0
)
666 struct brw_compile
*p
= &c
->func
;
669 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
670 struct brw_reg tmp
= get_tmp(c
);
671 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
673 /* tmp_d = floor(arg0.x) */
674 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
676 /* result[0] = 2.0 ^ tmp */
678 /* Adjust exponent for floating point:
681 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
683 /* Install exponent and sign.
684 * Excess drops off the edge:
686 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
687 tmp_d
, brw_imm_d(23));
692 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
693 /* result[1] = arg0.x - floor(arg0.x) */
694 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
697 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
698 /* As with the LOG instruction, we might be better off just
699 * doing a taylor expansion here, seeing as we have to do all
702 * If mathbox partial precision is too low, consider also:
703 * result[3] = result[0] * EXP(result[1])
706 BRW_MATH_FUNCTION_EXP
,
707 brw_writemask(dst
, WRITEMASK_Z
),
708 brw_swizzle1(arg0
, 0),
709 BRW_MATH_PRECISION_FULL
);
712 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
713 /* result[3] = 1.0; */
714 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
719 static void emit_log_noalias( struct brw_vs_compile
*c
,
721 struct brw_reg arg0
)
723 struct brw_compile
*p
= &c
->func
;
724 struct brw_reg tmp
= dst
;
725 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
726 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
727 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
728 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
732 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
735 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
738 * These almost look likey they could be joined up, but not really
741 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
742 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
744 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
746 brw_writemask(tmp_ud
, WRITEMASK_X
),
747 brw_swizzle1(arg0_ud
, 0),
748 brw_imm_ud((1U<<31)-1));
751 brw_writemask(tmp_ud
, WRITEMASK_X
),
756 brw_writemask(tmp
, WRITEMASK_X
),
757 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
761 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
763 brw_writemask(tmp_ud
, WRITEMASK_Y
),
764 brw_swizzle1(arg0_ud
, 0),
765 brw_imm_ud((1<<23)-1));
768 brw_writemask(tmp_ud
, WRITEMASK_Y
),
770 brw_imm_ud(127<<23));
773 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
774 /* result[2] = result[0] + LOG2(result[1]); */
776 /* Why bother? The above is just a hint how to do this with a
777 * taylor series. Maybe we *should* use a taylor series as by
778 * the time all the above has been done it's almost certainly
779 * quicker than calling the mathbox, even with low precision.
782 * - result[0] + mathbox.LOG2(result[1])
783 * - mathbox.LOG2(arg0.x)
784 * - result[0] + inline_taylor_approx(result[1])
787 BRW_MATH_FUNCTION_LOG
,
788 brw_writemask(tmp
, WRITEMASK_Z
),
789 brw_swizzle1(tmp
, 1),
790 BRW_MATH_PRECISION_FULL
);
793 brw_writemask(tmp
, WRITEMASK_Z
),
794 brw_swizzle1(tmp
, 2),
795 brw_swizzle1(tmp
, 0));
798 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
799 /* result[3] = 1.0; */
800 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
804 brw_MOV(p
, dst
, tmp
);
810 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
812 static void emit_dst_noalias( struct brw_vs_compile
*c
,
817 struct brw_compile
*p
= &c
->func
;
819 /* There must be a better way to do this:
821 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
822 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
823 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
824 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
825 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
826 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
827 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
828 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
832 static void emit_xpd( struct brw_compile
*p
,
837 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
838 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
842 static void emit_lit_noalias( struct brw_vs_compile
*c
,
844 struct brw_reg arg0
)
846 struct brw_compile
*p
= &c
->func
;
847 struct brw_instruction
*if_insn
;
848 struct brw_reg tmp
= dst
;
849 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
854 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
855 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
857 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
858 * to get all channels active inside the IF. In the clipping code
859 * we run with NoMask, so it's not an option and we can use
860 * BRW_EXECUTE_1 for all comparisions.
862 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
863 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
865 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
867 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
868 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
869 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
872 BRW_MATH_FUNCTION_POW
,
873 brw_writemask(dst
, WRITEMASK_Z
),
874 brw_swizzle1(tmp
, 2),
875 brw_swizzle1(arg0
, 3),
876 BRW_MATH_PRECISION_PARTIAL
);
879 brw_ENDIF(p
, if_insn
);
884 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
890 struct brw_compile
*p
= &c
->func
;
892 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
893 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
894 brw_MAC(p
, dst
, arg0
, arg1
);
897 /** 3 or 4-component vector normalization */
898 static void emit_nrm( struct brw_vs_compile
*c
,
903 struct brw_compile
*p
= &c
->func
;
904 struct brw_reg tmp
= get_tmp(c
);
906 /* tmp = dot(arg0, arg0) */
908 brw_DP3(p
, tmp
, arg0
, arg0
);
910 brw_DP4(p
, tmp
, arg0
, arg0
);
912 /* tmp = 1 / sqrt(tmp) */
913 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
915 /* dst = arg0 * tmp */
916 brw_MUL(p
, dst
, arg0
, tmp
);
922 static struct brw_reg
923 get_constant(struct brw_vs_compile
*c
,
924 const struct prog_instruction
*inst
,
927 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
928 struct brw_compile
*p
= &c
->func
;
929 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
931 assert(argIndex
< 3);
933 assert(c
->func
.brw
->intel
.gen
< 6); /* FINISHME */
935 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
936 /* Keep track of the last constant loaded in this slot, for reuse. */
937 c
->current_const
[argIndex
].index
= src
->Index
;
940 printf(" fetch const[%d] for arg %d into reg %d\n",
941 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
943 /* need to fetch the constant now */
945 const_reg
, /* writeback dest */
946 16 * src
->Index
, /* byte offset */
947 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
951 /* replicate lower four floats into upper half (to get XYZWXYZW) */
952 const_reg
= stride(const_reg
, 0, 4, 0);
958 static struct brw_reg
959 get_reladdr_constant(struct brw_vs_compile
*c
,
960 const struct prog_instruction
*inst
,
963 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
964 struct brw_compile
*p
= &c
->func
;
965 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
966 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
967 struct brw_reg byte_addr_reg
= retype(get_tmp(c
), BRW_REGISTER_TYPE_D
);
969 assert(argIndex
< 3);
971 assert(c
->func
.brw
->intel
.gen
< 6); /* FINISHME */
973 /* Can't reuse a reladdr constant load. */
974 c
->current_const
[argIndex
].index
= -1;
977 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
978 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
981 brw_MUL(p
, byte_addr_reg
, addrReg
, brw_imm_ud(16));
983 /* fetch the first vec4 */
984 brw_dp_READ_4_vs_relative(p
,
985 const_reg
, /* writeback dest */
986 byte_addr_reg
, /* address register */
987 16 * src
->Index
, /* byte offset */
988 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
996 /* TODO: relative addressing!
998 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
999 gl_register_file file
,
1003 case PROGRAM_TEMPORARY
:
1005 case PROGRAM_OUTPUT
:
1006 assert(c
->regs
[file
][index
].nr
!= 0);
1007 return c
->regs
[file
][index
];
1008 case PROGRAM_STATE_VAR
:
1009 case PROGRAM_CONSTANT
:
1010 case PROGRAM_UNIFORM
:
1011 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1012 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1013 case PROGRAM_ADDRESS
:
1015 return c
->regs
[file
][index
];
1017 case PROGRAM_UNDEFINED
: /* undef values */
1018 return brw_null_reg();
1020 case PROGRAM_LOCAL_PARAM
:
1021 case PROGRAM_ENV_PARAM
:
1022 case PROGRAM_WRITE_ONLY
:
1025 return brw_null_reg();
1031 * Indirect addressing: get reg[[arg] + offset].
1033 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1038 struct brw_compile
*p
= &c
->func
;
1039 struct brw_reg tmp
= get_tmp(c
);
1040 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1041 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1042 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1043 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1044 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1046 /* Set the vertical stride on the register access so that the first
1047 * 4 components come from a0.0 and the second 4 from a0.1.
1049 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1052 brw_push_insn_state(p
);
1053 brw_set_access_mode(p
, BRW_ALIGN_1
);
1055 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1056 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1058 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1059 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1061 brw_MOV(p
, tmp
, indirect
);
1063 brw_pop_insn_state(p
);
1066 /* NOTE: tmp not released */
1071 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1072 const struct prog_instruction
*inst
,
1075 struct brw_compile
*p
= &c
->func
;
1077 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1078 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1079 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1080 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1081 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1082 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1084 /* Because destination register indirect addressing can only use
1085 * one index, we'll write each vertex's vec4 value separately.
1087 val
.width
= BRW_WIDTH_4
;
1088 val
.vstride
= BRW_VERTICAL_STRIDE_4
;
1090 brw_push_insn_state(p
);
1091 brw_set_access_mode(p
, BRW_ALIGN_1
);
1093 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1094 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1095 brw_MOV(p
, indirect
, val
);
1097 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1098 brw_ADD(p
, brw_address_reg(0), acc
,
1099 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1100 brw_MOV(p
, indirect
, suboffset(val
, 4));
1102 brw_pop_insn_state(p
);
1106 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1107 * TODO: relative addressing!
1109 static struct brw_reg
1110 get_src_reg( struct brw_vs_compile
*c
,
1111 const struct prog_instruction
*inst
,
1114 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1115 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1116 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1118 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1119 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1121 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1125 return brw_imm_f(0.0f
);
1126 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1131 return brw_imm_f(-1.0F
);
1133 return brw_imm_f(1.0F
);
1134 } else if (src
->File
== PROGRAM_CONSTANT
) {
1135 const struct gl_program_parameter_list
*params
;
1139 switch (src
->Swizzle
) {
1154 if (component
>= 0) {
1155 params
= c
->vp
->program
.Base
.Parameters
;
1156 f
= params
->ParameterValues
[src
->Index
][component
];
1162 return brw_imm_f(f
);
1168 case PROGRAM_TEMPORARY
:
1170 case PROGRAM_OUTPUT
:
1172 return deref(c
, c
->regs
[file
][0], index
, 32);
1175 assert(c
->regs
[file
][index
].nr
!= 0);
1176 return c
->regs
[file
][index
];
1179 case PROGRAM_STATE_VAR
:
1180 case PROGRAM_CONSTANT
:
1181 case PROGRAM_UNIFORM
:
1182 case PROGRAM_ENV_PARAM
:
1183 case PROGRAM_LOCAL_PARAM
:
1184 if (c
->vp
->use_const_buffer
) {
1185 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1186 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1187 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1189 return get_reladdr_constant(c
, inst
, argIndex
);
1191 return get_constant(c
, inst
, argIndex
);
1194 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
, 16);
1197 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1198 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1200 case PROGRAM_ADDRESS
:
1202 return c
->regs
[file
][index
];
1204 case PROGRAM_UNDEFINED
:
1205 /* this is a normal case since we loop over all three src args */
1206 return brw_null_reg();
1208 case PROGRAM_WRITE_ONLY
:
1211 return brw_null_reg();
1216 * Return the brw reg for the given instruction's src argument.
1217 * Will return mangled results for SWZ op. The emit_swz() function
1218 * ignores this result and recalculates taking extended swizzles into
1221 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1222 const struct prog_instruction
*inst
,
1225 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1228 if (src
->File
== PROGRAM_UNDEFINED
)
1229 return brw_null_reg();
1231 reg
= get_src_reg(c
, inst
, argIndex
);
1233 /* Convert 3-bit swizzle to 2-bit.
1235 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1236 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1237 GET_SWZ(src
->Swizzle
, 1),
1238 GET_SWZ(src
->Swizzle
, 2),
1239 GET_SWZ(src
->Swizzle
, 3));
1242 /* Note this is ok for non-swizzle instructions:
1244 reg
.negate
= src
->Negate
? 1 : 0;
1251 * Get brw register for the given program dest register.
1253 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1254 struct prog_dst_register dst
)
1259 case PROGRAM_TEMPORARY
:
1260 case PROGRAM_OUTPUT
:
1261 /* register-indirect addressing is only 1x1, not VxH, for
1262 * destination regs. So, for RelAddr we'll return a temporary
1263 * for the dest and do a move of the result to the RelAddr
1264 * register after the instruction emit.
1269 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1270 reg
= c
->regs
[dst
.File
][dst
.Index
];
1273 case PROGRAM_ADDRESS
:
1274 assert(dst
.Index
== 0);
1275 reg
= c
->regs
[dst
.File
][dst
.Index
];
1277 case PROGRAM_UNDEFINED
:
1278 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1279 reg
= brw_null_reg();
1283 reg
= brw_null_reg();
1286 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1287 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1293 static void emit_swz( struct brw_vs_compile
*c
,
1295 const struct prog_instruction
*inst
)
1297 const GLuint argIndex
= 0;
1298 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1299 struct brw_compile
*p
= &c
->func
;
1300 GLuint zeros_mask
= 0;
1301 GLuint ones_mask
= 0;
1302 GLuint src_mask
= 0;
1304 GLboolean need_tmp
= (src
.Negate
&&
1305 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1306 struct brw_reg tmp
= dst
;
1312 for (i
= 0; i
< 4; i
++) {
1313 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1314 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1333 /* Do src first, in case dst aliases src:
1336 struct brw_reg arg0
;
1338 arg0
= get_src_reg(c
, inst
, argIndex
);
1340 arg0
= brw_swizzle(arg0
,
1341 src_swz
[0], src_swz
[1],
1342 src_swz
[2], src_swz
[3]);
1344 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1348 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1351 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1354 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1357 brw_MOV(p
, dst
, tmp
);
1358 release_tmp(c
, tmp
);
1364 * Post-vertex-program processing. Send the results to the URB.
1366 static void emit_vertex_write( struct brw_vs_compile
*c
)
1368 struct brw_compile
*p
= &c
->func
;
1369 struct brw_context
*brw
= p
->brw
;
1370 struct intel_context
*intel
= &brw
->intel
;
1371 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1374 GLuint len_vertex_header
= 2;
1377 if (c
->key
.copy_edgeflag
) {
1379 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1380 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1383 if (intel
->gen
< 6) {
1384 /* Build ndc coords */
1386 /* ndc = 1.0 / pos.w */
1387 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1388 /* ndc.xyz = pos * ndc */
1389 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1392 /* Update the header for point size, user clipping flags, and -ve rhw
1395 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1396 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
)
1398 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1401 brw_MOV(p
, header1
, brw_imm_ud(0));
1403 brw_set_access_mode(p
, BRW_ALIGN_16
);
1405 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1406 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1407 if (intel
->gen
< 6) {
1408 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1409 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1411 brw_MOV(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0));
1414 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1415 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1416 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1417 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1418 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1421 /* i965 clipping workaround:
1422 * 1) Test for -ve rhw
1424 * set ndc = (0,0,0,0)
1427 * Later, clipping will detect ucp[6] and ensure the primitive is
1428 * clipped against all fixed planes.
1430 if (brw
->has_negative_rhw_bug
) {
1432 vec8(brw_null_reg()),
1434 brw_swizzle1(ndc
, 3),
1437 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1438 brw_MOV(p
, ndc
, brw_imm_f(0));
1439 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1442 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1443 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1444 brw_set_access_mode(p
, BRW_ALIGN_16
);
1446 release_tmp(c
, header1
);
1449 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1452 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1453 * of zeros followed by two sets of NDC coordinates:
1455 brw_set_access_mode(p
, BRW_ALIGN_1
);
1456 brw_set_acc_write_control(p
, 0);
1458 /* The VUE layout is documented in Volume 2a. */
1459 if (intel
->gen
>= 6) {
1460 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1461 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1462 * dword 4-7 (m2) is the 4D space position
1463 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1464 * enabled. We don't use it, so skip it.
1465 * m3 is the first vertex element data we fill, which is the vertex
1468 brw_MOV(p
, brw_message_reg(2), pos
);
1469 len_vertex_header
= 1;
1470 } else if (intel
->gen
== 5) {
1471 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1472 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1473 * dword 4-7 (m2) is the ndc position (set above)
1474 * dword 8-11 (m3) of the vertex header is the 4D space position
1475 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1476 * m6 is a pad so that the vertex element data is aligned
1477 * m7 is the first vertex data we fill, which is the vertex position.
1479 brw_MOV(p
, brw_message_reg(2), ndc
);
1480 brw_MOV(p
, brw_message_reg(3), pos
);
1481 brw_MOV(p
, brw_message_reg(7), pos
);
1482 len_vertex_header
= 6;
1484 /* There are 8 dwords in VUE header pre-Ironlake:
1485 * dword 0-3 (m1) is indices, point width, clip flags.
1486 * dword 4-7 (m2) is ndc position (set above)
1488 * dword 8-11 (m3) is the first vertex data, which we always have be the
1491 brw_MOV(p
, brw_message_reg(2), ndc
);
1492 brw_MOV(p
, brw_message_reg(3), pos
);
1493 len_vertex_header
= 2;
1496 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1497 next_mrf
= 2 + len_vertex_header
;
1498 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1499 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1501 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
1504 if (i
>= VERT_RESULT_TEX0
&&
1505 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1506 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1508 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1509 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
1513 eot
= (c
->first_overflow_output
== 0);
1516 brw_null_reg(), /* dest */
1517 0, /* starting mrf reg nr */
1521 MIN2(c
->nr_outputs
+ 1 + len_vertex_header
, (BRW_MAX_MRF
-1)), /* msg len */
1522 0, /* response len */
1524 eot
, /* writes complete */
1525 0, /* urb destination offset */
1526 BRW_URB_SWIZZLE_INTERLEAVE
);
1528 if (c
->first_overflow_output
> 0) {
1529 /* Not all of the vertex outputs/results fit into the MRF.
1530 * Move the overflowed attributes from the GRF to the MRF and
1531 * issue another brw_urb_WRITE().
1534 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1535 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1536 /* move from GRF to MRF */
1537 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1543 brw_null_reg(), /* dest */
1544 0, /* starting mrf reg nr */
1549 0, /* response len */
1551 1, /* writes complete */
1552 14 / 2, /* urb destination offset */
1553 BRW_URB_SWIZZLE_INTERLEAVE
);
1558 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1560 struct brw_compile
*p
= &c
->func
;
1561 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1563 if (p
->nr_insn
== 0)
1566 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1569 switch (prev_insn
->header
.opcode
) {
1570 case BRW_OPCODE_MOV
:
1571 case BRW_OPCODE_MAC
:
1572 case BRW_OPCODE_MUL
:
1573 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1574 prev_insn
->header
.execution_size
== val
.width
&&
1575 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1576 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1577 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1578 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1579 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1580 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1590 get_predicate(const struct prog_instruction
*inst
)
1592 if (inst
->DstReg
.CondMask
== COND_TR
)
1593 return BRW_PREDICATE_NONE
;
1595 /* All of GLSL only produces predicates for COND_NE and one channel per
1596 * vector. Fail badly if someone starts doing something else, as it might
1597 * mean infinite looping or something.
1599 * We'd like to support all the condition codes, but our hardware doesn't
1600 * quite match the Mesa IR, which is modeled after the NV extensions. For
1601 * those, the instruction may update the condition codes or not, then any
1602 * later instruction may use one of those condition codes. For gen4, the
1603 * instruction may update the flags register based on one of the condition
1604 * codes output by the instruction, and then further instructions may
1605 * predicate on that. We can probably support this, but it won't
1606 * necessarily be easy.
1608 assert(inst
->DstReg
.CondMask
== COND_NE
);
1610 switch (inst
->DstReg
.CondSwizzle
) {
1612 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1614 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1616 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1618 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1620 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1621 inst
->DstReg
.CondMask
);
1622 return BRW_PREDICATE_NORMAL
;
1626 /* Emit the vertex program instructions here.
1628 void brw_vs_emit(struct brw_vs_compile
*c
)
1630 #define MAX_IF_DEPTH 32
1631 #define MAX_LOOP_DEPTH 32
1632 struct brw_compile
*p
= &c
->func
;
1633 struct brw_context
*brw
= p
->brw
;
1634 struct intel_context
*intel
= &brw
->intel
;
1635 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1636 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1637 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1638 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1639 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1643 if (INTEL_DEBUG
& DEBUG_VS
) {
1644 printf("vs-mesa:\n");
1645 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1650 /* FIXME Need to fix conditional instruction to remove this */
1651 if (intel
->gen
>= 6)
1652 p
->single_program_flow
= GL_TRUE
;
1654 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1655 brw_set_access_mode(p
, BRW_ALIGN_16
);
1656 if_depth_in_loop
[loop_depth
] = 0;
1658 brw_set_acc_write_control(p
, 1);
1660 for (insn
= 0; insn
< nr_insns
; insn
++) {
1662 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1664 /* Message registers can't be read, so copy the output into GRF
1665 * register if they are used in source registers
1667 for (i
= 0; i
< 3; i
++) {
1668 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1669 GLuint index
= src
->Index
;
1670 GLuint file
= src
->File
;
1671 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1672 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1675 switch (inst
->Opcode
) {
1678 c
->needs_stack
= GL_TRUE
;
1685 /* Static register allocation
1687 brw_vs_alloc_regs(c
);
1690 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1692 for (insn
= 0; insn
< nr_insns
; insn
++) {
1694 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1695 struct brw_reg args
[3], dst
;
1699 printf("%d: ", insn
);
1700 _mesa_print_instruction(inst
);
1703 /* Get argument regs. SWZ is special and does this itself.
1705 if (inst
->Opcode
!= OPCODE_SWZ
)
1706 for (i
= 0; i
< 3; i
++) {
1707 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1710 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1711 args
[i
] = c
->output_regs
[index
].reg
;
1713 args
[i
] = get_arg(c
, inst
, i
);
1716 /* Get dest regs. Note that it is possible for a reg to be both
1717 * dst and arg, given the static allocation of registers. So
1718 * care needs to be taken emitting multi-operation instructions.
1720 index
= inst
->DstReg
.Index
;
1721 file
= inst
->DstReg
.File
;
1722 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1723 dst
= c
->output_regs
[index
].reg
;
1725 dst
= get_dst(c
, inst
->DstReg
);
1727 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1728 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1729 inst
->SaturateMode
);
1732 switch (inst
->Opcode
) {
1734 brw_MOV(p
, dst
, brw_abs(args
[0]));
1737 brw_ADD(p
, dst
, args
[0], args
[1]);
1740 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1743 brw_DP2(p
, dst
, args
[0], args
[1]);
1746 brw_DP3(p
, dst
, args
[0], args
[1]);
1749 brw_DP4(p
, dst
, args
[0], args
[1]);
1752 brw_DPH(p
, dst
, args
[0], args
[1]);
1755 emit_nrm(c
, dst
, args
[0], 3);
1758 emit_nrm(c
, dst
, args
[0], 4);
1761 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1764 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1767 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1770 brw_RNDD(p
, dst
, args
[0]);
1773 brw_RNDD(p
, dst
, args
[0]);
1776 brw_FRC(p
, dst
, args
[0]);
1779 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1782 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1785 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1788 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1791 if (!accumulator_contains(c
, args
[2]))
1792 brw_MOV(p
, brw_acc_reg(), args
[2]);
1793 brw_MAC(p
, dst
, args
[0], args
[1]);
1796 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
1799 emit_max(p
, dst
, args
[0], args
[1]);
1802 emit_min(p
, dst
, args
[0], args
[1]);
1805 brw_MOV(p
, dst
, args
[0]);
1808 brw_MUL(p
, dst
, args
[0], args
[1]);
1811 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1814 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1817 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1821 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1824 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1827 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1830 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1833 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1836 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1839 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1842 unalias1(c
, dst
, args
[0], emit_sign
);
1845 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1848 /* The args[0] value can't be used here as it won't have
1849 * correctly encoded the full swizzle:
1851 emit_swz(c
, dst
, inst
);
1854 /* round toward zero */
1855 brw_RNDZ(p
, dst
, args
[0]);
1858 emit_xpd(p
, dst
, args
[0], args
[1]);
1861 assert(if_depth
< MAX_IF_DEPTH
);
1862 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1863 /* Note that brw_IF smashes the predicate_control field. */
1864 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1865 if_depth_in_loop
[loop_depth
]++;
1869 clear_current_const(c
);
1870 assert(if_depth
> 0);
1871 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1874 clear_current_const(c
);
1875 assert(if_depth
> 0);
1876 brw_ENDIF(p
, if_inst
[--if_depth
]);
1877 if_depth_in_loop
[loop_depth
]--;
1879 case OPCODE_BGNLOOP
:
1880 clear_current_const(c
);
1881 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1882 if_depth_in_loop
[loop_depth
] = 0;
1885 brw_set_predicate_control(p
, get_predicate(inst
));
1886 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
1887 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1890 brw_set_predicate_control(p
, get_predicate(inst
));
1891 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
1892 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1894 case OPCODE_ENDLOOP
:
1896 clear_current_const(c
);
1897 struct brw_instruction
*inst0
, *inst1
;
1902 if (intel
->gen
== 5)
1905 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1906 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1907 while (inst0
> loop_inst
[loop_depth
]) {
1909 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
1910 inst0
->bits3
.if_else
.jump_count
== 0) {
1911 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1913 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
1914 inst0
->bits3
.if_else
.jump_count
== 0) {
1915 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1921 brw_set_predicate_control(p
, get_predicate(inst
));
1922 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1923 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1926 brw_set_access_mode(p
, BRW_ALIGN_1
);
1927 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1928 brw_set_access_mode(p
, BRW_ALIGN_16
);
1929 brw_ADD(p
, get_addr_reg(stack_index
),
1930 get_addr_reg(stack_index
), brw_imm_d(4));
1931 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1932 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1935 brw_ADD(p
, get_addr_reg(stack_index
),
1936 get_addr_reg(stack_index
), brw_imm_d(-4));
1937 brw_set_access_mode(p
, BRW_ALIGN_1
);
1938 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1939 brw_set_access_mode(p
, BRW_ALIGN_16
);
1942 emit_vertex_write(c
);
1948 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1954 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1955 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1956 _mesa_opcode_string(inst
->Opcode
) :
1960 /* Set the predication update on the last instruction of the native
1961 * instruction sequence.
1963 * This would be problematic if it was set on a math instruction,
1964 * but that shouldn't be the case with the current GLSL compiler.
1966 if (inst
->CondUpdate
) {
1967 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
1969 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
1970 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
1973 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1974 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1975 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1976 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1979 /* Result color clamping.
1981 * When destination register is an output register and
1982 * it's primary/secondary front/back color, we have to clamp
1983 * the result to [0,1]. This is done by enabling the
1984 * saturation bit for the last instruction.
1986 * We don't use brw_set_saturate() as it modifies
1987 * p->current->header.saturate, which affects all the subsequent
1988 * instructions. Instead, we directly modify the header
1989 * of the last (already stored) instruction.
1991 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1992 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1993 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1994 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1995 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1996 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
2000 if (inst
->DstReg
.RelAddr
) {
2001 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
2002 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
2003 move_to_reladdr_dst(c
, inst
, dst
);
2009 brw_resolve_cals(p
);
2013 if (INTEL_DEBUG
& DEBUG_VS
) {
2016 printf("vs-native:\n");
2017 for (i
= 0; i
< p
->nr_insn
; i
++)
2018 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);