2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
120 * Preallocate GRF register before code emit.
121 * Do things as simply as possible. Allocate and populate all regs
124 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
126 struct intel_context
*intel
= &c
->func
.brw
->intel
;
127 GLuint i
, reg
= 0, mrf
;
128 int attributes_in_vue
;
129 int first_reladdr_output
;
131 /* Determine whether to use a real constant buffer or use a block
132 * of GRF registers for constants. The later is faster but only
133 * works if everything fits in the GRF.
134 * XXX this heuristic/check may need some fine tuning...
136 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
137 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
138 c
->vp
->use_const_buffer
= GL_TRUE
;
140 c
->vp
->use_const_buffer
= GL_FALSE
;
142 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
144 /* r0 -- reserved as usual
146 c
->r0
= brw_vec8_grf(reg
, 0);
149 /* User clip planes from curbe:
151 if (c
->key
.nr_userclip
) {
152 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
153 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
156 /* Deal with curbe alignment:
158 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
161 /* Vertex program parameters from curbe:
163 if (c
->vp
->use_const_buffer
) {
164 int max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
167 /* We've got more constants than we can load with the push
168 * mechanism. This is often correlated with reladdr loads where
169 * we should probably be using a pull mechanism anyway to avoid
170 * excessive reading. However, the pull mechanism is slow in
171 * general. So, we try to allocate as many non-reladdr-loaded
172 * constants through the push buffer as we can before giving up.
174 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
176 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
178 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
181 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
182 if ((inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
183 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
184 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
185 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
186 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) ||
187 inst
->SrcReg
[arg
].RelAddr
)
190 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
191 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
196 for (i
= 0; i
< constant
; i
++) {
197 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2,
201 reg
+= (constant
+ 1) / 2;
202 c
->prog_data
.curb_read_length
= reg
- 1;
203 /* XXX 0 causes a bug elsewhere... */
204 c
->prog_data
.nr_params
= MAX2(constant
* 4, 4);
207 /* use a section of the GRF for constants */
208 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
209 for (i
= 0; i
< nr_params
; i
++) {
210 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
212 reg
+= (nr_params
+ 1) / 2;
213 c
->prog_data
.curb_read_length
= reg
- 1;
215 c
->prog_data
.nr_params
= nr_params
* 4;
218 /* Allocate input regs:
221 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
222 if (c
->prog_data
.inputs_read
& (1 << i
)) {
224 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
228 /* If there are no inputs, we'll still be reading one attribute's worth
229 * because it's required -- see urb_read_length setting.
231 if (c
->nr_inputs
== 0)
234 /* Allocate outputs. The non-position outputs go straight into message regs.
237 c
->first_output
= reg
;
238 c
->first_overflow_output
= 0;
242 else if (intel
->gen
== 5)
247 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
248 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
249 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
251 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
252 if (i
== VERT_RESULT_HPOS
) {
253 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
256 else if (i
== VERT_RESULT_PSIZ
) {
257 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
259 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
262 /* Two restrictions on our compute-to-MRF here. The
263 * message length for all SEND messages is restricted to
264 * [1,15], so we can't use mrf 15, as that means a length
267 * Additionally, URB writes are aligned to URB rows, so we
268 * need to put an even number of registers of URB data in
269 * each URB write so that the later write is aligned. A
270 * message length of 15 means 1 message header reg plus 14
273 * For attributes beyond the compute-to-MRF, we compute to
274 * GRFs and they will be written in the second URB_WRITE.
276 if (first_reladdr_output
> i
&& mrf
< 15) {
277 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
281 if (mrf
>= 15 && !c
->first_overflow_output
)
282 c
->first_overflow_output
= i
;
283 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
291 /* Allocate program temporaries:
293 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
294 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
298 /* Address reg(s). Don't try to use the internal address reg until
301 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
302 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
306 BRW_VERTICAL_STRIDE_8
,
308 BRW_HORIZONTAL_STRIDE_1
,
314 if (c
->vp
->use_const_buffer
) {
315 for (i
= 0; i
< 3; i
++) {
316 c
->current_const
[i
].index
= -1;
317 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
322 for (i
= 0; i
< 128; i
++) {
323 if (c
->output_regs
[i
].used_in_src
) {
324 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
329 if (c
->needs_stack
) {
330 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
334 /* Some opcodes need an internal temporary:
337 c
->last_tmp
= reg
; /* for allocation purposes */
339 /* Each input reg holds data from two vertices. The
340 * urb_read_length is the number of registers read from *each*
341 * vertex urb, so is half the amount:
343 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
344 /* Setting this field to 0 leads to undefined behavior according to the
345 * the VS_STATE docs. Our VUEs will always have at least one attribute
346 * sitting in them, even if it's padding.
348 if (c
->prog_data
.urb_read_length
== 0)
349 c
->prog_data
.urb_read_length
= 1;
351 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
352 * them to fit the biggest thing they need to.
354 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
356 /* See emit_vertex_write() for where the VUE's overhead on top of the
357 * attributes comes from.
360 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 7) / 8;
361 else if (intel
->gen
== 5)
362 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
364 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
366 c
->prog_data
.total_grf
= reg
;
368 if (INTEL_DEBUG
& DEBUG_VS
) {
369 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
370 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
371 printf("%s reg = %d\n", __FUNCTION__
, reg
);
377 * If an instruction uses a temp reg both as a src and the dest, we
378 * sometimes need to allocate an intermediate temporary.
380 static void unalias1( struct brw_vs_compile
*c
,
383 void (*func
)( struct brw_vs_compile
*,
387 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
388 struct brw_compile
*p
= &c
->func
;
389 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
391 brw_MOV(p
, dst
, tmp
);
401 * Checkes if 2-operand instruction needs an intermediate temporary.
403 static void unalias2( struct brw_vs_compile
*c
,
407 void (*func
)( struct brw_vs_compile
*,
412 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
413 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
414 struct brw_compile
*p
= &c
->func
;
415 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
416 func(c
, tmp
, arg0
, arg1
);
417 brw_MOV(p
, dst
, tmp
);
421 func(c
, dst
, arg0
, arg1
);
427 * Checkes if 3-operand instruction needs an intermediate temporary.
429 static void unalias3( struct brw_vs_compile
*c
,
434 void (*func
)( struct brw_vs_compile
*,
440 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
441 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
442 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
443 struct brw_compile
*p
= &c
->func
;
444 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
445 func(c
, tmp
, arg0
, arg1
, arg2
);
446 brw_MOV(p
, dst
, tmp
);
450 func(c
, dst
, arg0
, arg1
, arg2
);
454 static void emit_sop( struct brw_vs_compile
*c
,
460 struct brw_compile
*p
= &c
->func
;
462 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
463 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
464 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
465 brw_set_predicate_control_flag_value(p
, 0xff);
468 static void emit_seq( struct brw_vs_compile
*c
,
471 struct brw_reg arg1
)
473 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
476 static void emit_sne( struct brw_vs_compile
*c
,
479 struct brw_reg arg1
)
481 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
483 static void emit_slt( struct brw_vs_compile
*c
,
486 struct brw_reg arg1
)
488 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
491 static void emit_sle( struct brw_vs_compile
*c
,
494 struct brw_reg arg1
)
496 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
499 static void emit_sgt( struct brw_vs_compile
*c
,
502 struct brw_reg arg1
)
504 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
507 static void emit_sge( struct brw_vs_compile
*c
,
510 struct brw_reg arg1
)
512 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
515 static void emit_cmp( struct brw_compile
*p
,
519 struct brw_reg arg2
)
521 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
522 brw_SEL(p
, dst
, arg1
, arg2
);
523 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
526 static void emit_sign(struct brw_vs_compile
*c
,
530 struct brw_compile
*p
= &c
->func
;
532 brw_MOV(p
, dst
, brw_imm_f(0));
534 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
535 brw_MOV(p
, dst
, brw_imm_f(-1.0));
536 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
538 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
539 brw_MOV(p
, dst
, brw_imm_f(1.0));
540 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
543 static void emit_max( struct brw_compile
*p
,
546 struct brw_reg arg1
)
548 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
549 brw_SEL(p
, dst
, arg0
, arg1
);
550 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
553 static void emit_min( struct brw_compile
*p
,
556 struct brw_reg arg1
)
558 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
559 brw_SEL(p
, dst
, arg0
, arg1
);
560 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
564 static void emit_math1( struct brw_vs_compile
*c
,
570 /* There are various odd behaviours with SEND on the simulator. In
571 * addition there are documented issues with the fact that the GEN4
572 * processor doesn't do dependency control properly on SEND
573 * results. So, on balance, this kludge to get around failures
574 * with writemasked math results looks like it might be necessary
575 * whether that turns out to be a simulator bug or not:
577 struct brw_compile
*p
= &c
->func
;
578 struct intel_context
*intel
= &p
->brw
->intel
;
579 struct brw_reg tmp
= dst
;
580 GLboolean need_tmp
= (intel
->gen
< 6 &&
581 (dst
.dw1
.bits
.writemask
!= 0xf ||
582 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
590 BRW_MATH_SATURATE_NONE
,
593 BRW_MATH_DATA_SCALAR
,
597 brw_MOV(p
, dst
, tmp
);
603 static void emit_math2( struct brw_vs_compile
*c
,
610 struct brw_compile
*p
= &c
->func
;
611 struct intel_context
*intel
= &p
->brw
->intel
;
612 struct brw_reg tmp
= dst
;
613 GLboolean need_tmp
= (intel
->gen
< 6 &&
614 (dst
.dw1
.bits
.writemask
!= 0xf ||
615 dst
.file
!= BRW_GENERAL_REGISTER_FILE
));
620 brw_MOV(p
, brw_message_reg(3), arg1
);
625 BRW_MATH_SATURATE_NONE
,
628 BRW_MATH_DATA_SCALAR
,
632 brw_MOV(p
, dst
, tmp
);
638 static void emit_exp_noalias( struct brw_vs_compile
*c
,
640 struct brw_reg arg0
)
642 struct brw_compile
*p
= &c
->func
;
645 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
646 struct brw_reg tmp
= get_tmp(c
);
647 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
649 /* tmp_d = floor(arg0.x) */
650 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
652 /* result[0] = 2.0 ^ tmp */
654 /* Adjust exponent for floating point:
657 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
659 /* Install exponent and sign.
660 * Excess drops off the edge:
662 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
663 tmp_d
, brw_imm_d(23));
668 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
669 /* result[1] = arg0.x - floor(arg0.x) */
670 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
673 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
674 /* As with the LOG instruction, we might be better off just
675 * doing a taylor expansion here, seeing as we have to do all
678 * If mathbox partial precision is too low, consider also:
679 * result[3] = result[0] * EXP(result[1])
682 BRW_MATH_FUNCTION_EXP
,
683 brw_writemask(dst
, WRITEMASK_Z
),
684 brw_swizzle1(arg0
, 0),
685 BRW_MATH_PRECISION_FULL
);
688 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
689 /* result[3] = 1.0; */
690 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
695 static void emit_log_noalias( struct brw_vs_compile
*c
,
697 struct brw_reg arg0
)
699 struct brw_compile
*p
= &c
->func
;
700 struct brw_reg tmp
= dst
;
701 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
702 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
703 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
704 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
708 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
711 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
714 * These almost look likey they could be joined up, but not really
717 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
718 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
720 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
722 brw_writemask(tmp_ud
, WRITEMASK_X
),
723 brw_swizzle1(arg0_ud
, 0),
724 brw_imm_ud((1U<<31)-1));
727 brw_writemask(tmp_ud
, WRITEMASK_X
),
732 brw_writemask(tmp
, WRITEMASK_X
),
733 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
737 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
739 brw_writemask(tmp_ud
, WRITEMASK_Y
),
740 brw_swizzle1(arg0_ud
, 0),
741 brw_imm_ud((1<<23)-1));
744 brw_writemask(tmp_ud
, WRITEMASK_Y
),
746 brw_imm_ud(127<<23));
749 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
750 /* result[2] = result[0] + LOG2(result[1]); */
752 /* Why bother? The above is just a hint how to do this with a
753 * taylor series. Maybe we *should* use a taylor series as by
754 * the time all the above has been done it's almost certainly
755 * quicker than calling the mathbox, even with low precision.
758 * - result[0] + mathbox.LOG2(result[1])
759 * - mathbox.LOG2(arg0.x)
760 * - result[0] + inline_taylor_approx(result[1])
763 BRW_MATH_FUNCTION_LOG
,
764 brw_writemask(tmp
, WRITEMASK_Z
),
765 brw_swizzle1(tmp
, 1),
766 BRW_MATH_PRECISION_FULL
);
769 brw_writemask(tmp
, WRITEMASK_Z
),
770 brw_swizzle1(tmp
, 2),
771 brw_swizzle1(tmp
, 0));
774 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
775 /* result[3] = 1.0; */
776 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
780 brw_MOV(p
, dst
, tmp
);
786 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
788 static void emit_dst_noalias( struct brw_vs_compile
*c
,
793 struct brw_compile
*p
= &c
->func
;
795 /* There must be a better way to do this:
797 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
798 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
799 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
800 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
801 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
802 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
803 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
804 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
808 static void emit_xpd( struct brw_compile
*p
,
813 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
814 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
818 static void emit_lit_noalias( struct brw_vs_compile
*c
,
820 struct brw_reg arg0
)
822 struct brw_compile
*p
= &c
->func
;
823 struct brw_instruction
*if_insn
;
824 struct brw_reg tmp
= dst
;
825 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
830 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
831 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
833 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
834 * to get all channels active inside the IF. In the clipping code
835 * we run with NoMask, so it's not an option and we can use
836 * BRW_EXECUTE_1 for all comparisions.
838 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
839 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
841 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
843 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
844 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
845 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
848 BRW_MATH_FUNCTION_POW
,
849 brw_writemask(dst
, WRITEMASK_Z
),
850 brw_swizzle1(tmp
, 2),
851 brw_swizzle1(arg0
, 3),
852 BRW_MATH_PRECISION_PARTIAL
);
855 brw_ENDIF(p
, if_insn
);
860 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
866 struct brw_compile
*p
= &c
->func
;
868 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
869 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
870 brw_MAC(p
, dst
, arg0
, arg1
);
873 /** 3 or 4-component vector normalization */
874 static void emit_nrm( struct brw_vs_compile
*c
,
879 struct brw_compile
*p
= &c
->func
;
880 struct brw_reg tmp
= get_tmp(c
);
882 /* tmp = dot(arg0, arg0) */
884 brw_DP3(p
, tmp
, arg0
, arg0
);
886 brw_DP4(p
, tmp
, arg0
, arg0
);
888 /* tmp = 1 / sqrt(tmp) */
889 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
891 /* dst = arg0 * tmp */
892 brw_MUL(p
, dst
, arg0
, tmp
);
898 static struct brw_reg
899 get_constant(struct brw_vs_compile
*c
,
900 const struct prog_instruction
*inst
,
903 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
904 struct brw_compile
*p
= &c
->func
;
905 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
907 assert(argIndex
< 3);
909 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
910 /* Keep track of the last constant loaded in this slot, for reuse. */
911 c
->current_const
[argIndex
].index
= src
->Index
;
914 printf(" fetch const[%d] for arg %d into reg %d\n",
915 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
917 /* need to fetch the constant now */
919 const_reg
, /* writeback dest */
920 16 * src
->Index
, /* byte offset */
921 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
925 /* replicate lower four floats into upper half (to get XYZWXYZW) */
926 const_reg
= stride(const_reg
, 0, 4, 0);
932 static struct brw_reg
933 get_reladdr_constant(struct brw_vs_compile
*c
,
934 const struct prog_instruction
*inst
,
937 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
938 struct brw_compile
*p
= &c
->func
;
939 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
940 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
941 struct brw_reg byte_addr_reg
= get_tmp(c
);
943 assert(argIndex
< 3);
945 /* Can't reuse a reladdr constant load. */
946 c
->current_const
[argIndex
].index
= -1;
949 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
950 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
953 brw_MUL(p
, byte_addr_reg
, addrReg
, brw_imm_ud(16));
955 /* fetch the first vec4 */
956 brw_dp_READ_4_vs_relative(p
,
957 const_reg
, /* writeback dest */
958 byte_addr_reg
, /* address register */
959 16 * src
->Index
, /* byte offset */
960 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
968 /* TODO: relative addressing!
970 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
971 gl_register_file file
,
975 case PROGRAM_TEMPORARY
:
978 assert(c
->regs
[file
][index
].nr
!= 0);
979 return c
->regs
[file
][index
];
980 case PROGRAM_STATE_VAR
:
981 case PROGRAM_CONSTANT
:
982 case PROGRAM_UNIFORM
:
983 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
984 return c
->regs
[PROGRAM_STATE_VAR
][index
];
985 case PROGRAM_ADDRESS
:
987 return c
->regs
[file
][index
];
989 case PROGRAM_UNDEFINED
: /* undef values */
990 return brw_null_reg();
992 case PROGRAM_LOCAL_PARAM
:
993 case PROGRAM_ENV_PARAM
:
994 case PROGRAM_WRITE_ONLY
:
997 return brw_null_reg();
1003 * Indirect addressing: get reg[[arg] + offset].
1005 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1010 struct brw_compile
*p
= &c
->func
;
1011 struct brw_reg tmp
= get_tmp(c
);
1012 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1013 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1014 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1015 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1016 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1018 /* Set the vertical stride on the register access so that the first
1019 * 4 components come from a0.0 and the second 4 from a0.1.
1021 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1024 brw_push_insn_state(p
);
1025 brw_set_access_mode(p
, BRW_ALIGN_1
);
1027 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1028 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1030 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1031 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1033 brw_MOV(p
, tmp
, indirect
);
1035 brw_pop_insn_state(p
);
1038 /* NOTE: tmp not released */
1043 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1044 const struct prog_instruction
*inst
,
1047 struct brw_compile
*p
= &c
->func
;
1049 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1050 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1051 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1052 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1053 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1054 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1056 brw_push_insn_state(p
);
1057 brw_set_access_mode(p
, BRW_ALIGN_1
);
1059 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1060 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1061 brw_MOV(p
, indirect
, val
);
1063 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1064 brw_ADD(p
, brw_address_reg(0), acc
,
1065 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1066 brw_MOV(p
, indirect
, suboffset(val
, 4));
1068 brw_pop_insn_state(p
);
1072 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1073 * TODO: relative addressing!
1075 static struct brw_reg
1076 get_src_reg( struct brw_vs_compile
*c
,
1077 const struct prog_instruction
*inst
,
1080 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1081 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1082 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1084 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1085 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1087 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1091 return brw_imm_f(0.0f
);
1092 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1097 return brw_imm_f(-1.0F
);
1099 return brw_imm_f(1.0F
);
1100 } else if (src
->File
== PROGRAM_CONSTANT
) {
1101 const struct gl_program_parameter_list
*params
;
1105 switch (src
->Swizzle
) {
1120 if (component
>= 0) {
1121 params
= c
->vp
->program
.Base
.Parameters
;
1122 f
= params
->ParameterValues
[src
->Index
][component
];
1128 return brw_imm_f(f
);
1134 case PROGRAM_TEMPORARY
:
1136 case PROGRAM_OUTPUT
:
1138 return deref(c
, c
->regs
[file
][0], index
, 32);
1141 assert(c
->regs
[file
][index
].nr
!= 0);
1142 return c
->regs
[file
][index
];
1145 case PROGRAM_STATE_VAR
:
1146 case PROGRAM_CONSTANT
:
1147 case PROGRAM_UNIFORM
:
1148 case PROGRAM_ENV_PARAM
:
1149 case PROGRAM_LOCAL_PARAM
:
1150 if (c
->vp
->use_const_buffer
) {
1151 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1152 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1153 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1155 return get_reladdr_constant(c
, inst
, argIndex
);
1157 return get_constant(c
, inst
, argIndex
);
1160 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
, 16);
1163 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1164 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1166 case PROGRAM_ADDRESS
:
1168 return c
->regs
[file
][index
];
1170 case PROGRAM_UNDEFINED
:
1171 /* this is a normal case since we loop over all three src args */
1172 return brw_null_reg();
1174 case PROGRAM_WRITE_ONLY
:
1177 return brw_null_reg();
1182 * Return the brw reg for the given instruction's src argument.
1183 * Will return mangled results for SWZ op. The emit_swz() function
1184 * ignores this result and recalculates taking extended swizzles into
1187 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1188 const struct prog_instruction
*inst
,
1191 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1194 if (src
->File
== PROGRAM_UNDEFINED
)
1195 return brw_null_reg();
1197 reg
= get_src_reg(c
, inst
, argIndex
);
1199 /* Convert 3-bit swizzle to 2-bit.
1201 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1202 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1203 GET_SWZ(src
->Swizzle
, 1),
1204 GET_SWZ(src
->Swizzle
, 2),
1205 GET_SWZ(src
->Swizzle
, 3));
1208 /* Note this is ok for non-swizzle instructions:
1210 reg
.negate
= src
->Negate
? 1 : 0;
1217 * Get brw register for the given program dest register.
1219 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1220 struct prog_dst_register dst
)
1225 case PROGRAM_TEMPORARY
:
1226 case PROGRAM_OUTPUT
:
1227 /* register-indirect addressing is only 1x1, not VxH, for
1228 * destination regs. So, for RelAddr we'll return a temporary
1229 * for the dest and do a move of the result to the RelAddr
1230 * register after the instruction emit.
1235 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1236 reg
= c
->regs
[dst
.File
][dst
.Index
];
1239 case PROGRAM_ADDRESS
:
1240 assert(dst
.Index
== 0);
1241 reg
= c
->regs
[dst
.File
][dst
.Index
];
1243 case PROGRAM_UNDEFINED
:
1244 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1245 reg
= brw_null_reg();
1249 reg
= brw_null_reg();
1252 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1253 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1259 static void emit_swz( struct brw_vs_compile
*c
,
1261 const struct prog_instruction
*inst
)
1263 const GLuint argIndex
= 0;
1264 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1265 struct brw_compile
*p
= &c
->func
;
1266 GLuint zeros_mask
= 0;
1267 GLuint ones_mask
= 0;
1268 GLuint src_mask
= 0;
1270 GLboolean need_tmp
= (src
.Negate
&&
1271 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1272 struct brw_reg tmp
= dst
;
1278 for (i
= 0; i
< 4; i
++) {
1279 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1280 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1299 /* Do src first, in case dst aliases src:
1302 struct brw_reg arg0
;
1304 arg0
= get_src_reg(c
, inst
, argIndex
);
1306 arg0
= brw_swizzle(arg0
,
1307 src_swz
[0], src_swz
[1],
1308 src_swz
[2], src_swz
[3]);
1310 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1314 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1317 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1320 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1323 brw_MOV(p
, dst
, tmp
);
1324 release_tmp(c
, tmp
);
/* emit_vertex_write: emit the post-shader epilogue that assembles the VUE
 * (vertex URB entry) header and sends all vertex results to the URB via
 * brw_urb_WRITE, with a second overflow write when outputs exceed the MRF.
 *
 * NOTE(review): lossy extraction — declarations of `ndc`, `i`, `eot`,
 * `mrf`, the brw_urb_WRITE call heads, and various braces are missing.
 * Verify details against the complete file.
 */
1330 * Post-vertex-program processing. Send the results to the URB.
1332 static void emit_vertex_write( struct brw_vs_compile
*c
)
1334 struct brw_compile
*p
= &c
->func
;
1335 struct brw_context
*brw
= p
->brw
;
1336 struct intel_context
*intel
= &brw
->intel
;
/* Clip-space position output; source for NDC and clip-plane tests. */
1337 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1340 GLuint len_vertex_header
= 2;
/* Forward the edge flag from input to output when the key asks for it
 * (the copying MOV's opening line is missing from this extraction).
 */
1343 if (c
->key
.copy_edgeflag
) {
1345 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1346 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
/* Pre-Sandybridge parts want NDC coordinates in the vertex header. */
1349 if (intel
->gen
< 6) {
1350 /* Build ndc coords */
1352 /* ndc = 1.0 / pos.w */
1353 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1354 /* ndc.xyz = pos * ndc */
1355 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1358 /* Update the header for point size, user clipping flags, and -ve rhw
1361 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1362 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
)
/* header1 (m1) accumulates point size and clip flags as UD bits. */
1364 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1367 brw_MOV(p
, header1
, brw_imm_ud(0));
1369 brw_set_access_mode(p
, BRW_ALIGN_16
);
/* Pack point size into header1.w — scaled by 1<<11 then masked to the
 * 11-bit field at bit 8 (hardware point-width format).
 */
1371 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1372 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1373 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1374 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
/* One clip flag bit per user clip plane: set bit i when the DP4 of
 * pos against the plane is negative (predicated OR).
 */
1377 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1378 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1379 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1380 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1381 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1384 /* i965 clipping workaround:
1385 * 1) Test for -ve rhw
1387 * set ndc = (0,0,0,0)
1390 * Later, clipping will detect ucp[6] and ensure the primitive is
1391 * clipped against all fixed planes.
1393 if (brw
->has_negative_rhw_bug
) {
1395 vec8(brw_null_reg()),
1397 brw_swizzle1(ndc
, 3),
1400 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1401 brw_MOV(p
, ndc
, brw_imm_f(0));
1402 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1405 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1406 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1407 brw_set_access_mode(p
, BRW_ALIGN_16
);
1409 release_tmp(c
, header1
);
/* No point size/clip/rhw work needed: header dword m1 is just zero. */
1412 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1415 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1416 * of zeros followed by two sets of NDC coordinates:
1418 brw_set_access_mode(p
, BRW_ALIGN_1
);
1419 brw_set_acc_write_control(p
, 0);
1421 /* The VUE layout is documented in Volume 2a. */
1422 if (intel
->gen
>= 6) {
1423 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1424 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1425 * dword 4-7 (m2) is the 4D space position
1426 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1427 * enabled. We don't use it, so skip it.
1428 * m3 is the first vertex element data we fill, which is the vertex
1431 brw_MOV(p
, brw_message_reg(2), pos
);
1432 brw_MOV(p
, brw_message_reg(3), pos
);
1433 len_vertex_header
= 2;
1434 } else if (intel
->gen
== 5) {
1435 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1436 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1437 * dword 4-7 (m2) is the ndc position (set above)
1438 * dword 8-11 (m3) of the vertex header is the 4D space position
1439 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1440 * m6 is a pad so that the vertex element data is aligned
1441 * m7 is the first vertex data we fill, which is the vertex position.
1443 brw_MOV(p
, brw_message_reg(2), ndc
);
1444 brw_MOV(p
, brw_message_reg(3), pos
);
1445 brw_MOV(p
, brw_message_reg(7), pos
);
1446 len_vertex_header
= 6;
1448 /* There are 8 dwords in VUE header pre-Ironlake:
1449 * dword 0-3 (m1) is indices, point width, clip flags.
1450 * dword 4-7 (m2) is ndc position (set above)
1452 * dword 8-11 (m3) is the first vertex data, which we always have be the
1455 brw_MOV(p
, brw_message_reg(2), ndc
);
1456 brw_MOV(p
, brw_message_reg(3), pos
);
1457 len_vertex_header
= 2;
1460 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1461 next_mrf
= 2 + len_vertex_header
;
1462 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
/* Outputs past first_overflow_output are handled by the second URB
 * write below; skip them here (the `break`/`continue` statements are
 * missing from this extraction).
 */
1463 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1465 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
/* GRF-allocated texcoord-range outputs must be copied into MRFs;
 * outputs already living in MRFs just advance next_mrf past them.
 */
1468 if (i
>= VERT_RESULT_TEX0
&&
1469 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1470 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1472 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1473 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
/* End-of-thread only when everything fit in one URB write. */
1477 eot
= (c
->first_overflow_output
== 0);
/* First brw_urb_WRITE (call head missing from this extraction). */
1480 brw_null_reg(), /* dest */
1481 0, /* starting mrf reg nr */
1485 MIN2(c
->nr_outputs
+ 1 + len_vertex_header
, (BRW_MAX_MRF
-1)), /* msg len */
1486 0, /* response len */
1488 eot
, /* writes complete */
1489 0, /* urb destination offset */
1490 BRW_URB_SWIZZLE_INTERLEAVE
);
1492 if (c
->first_overflow_output
> 0) {
1493 /* Not all of the vertex outputs/results fit into the MRF.
1494 * Move the overflowed attributes from the GRF to the MRF and
1495 * issue another brw_urb_WRITE().
1498 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1499 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1500 /* move from GRF to MRF */
1501 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
/* Second brw_urb_WRITE for the overflow outputs (call head and some
 * arguments missing from this extraction); always terminates the
 * thread (`writes complete` = 1).
 */
1507 brw_null_reg(), /* dest */
1508 0, /* starting mrf reg nr */
1513 0, /* response len */
1515 1, /* writes complete */
1516 14 / 2, /* urb destination offset */
1517 BRW_URB_SWIZZLE_INTERLEAVE
);
/* accumulator_contains: peephole test used before OPCODE_MAD — decide
 * whether the most recently emitted instruction already left `val` in the
 * hardware accumulator, so the MOV-to-accumulator before a MAC can be
 * skipped.
 *
 * NOTE(review): lossy extraction — the return statements (presumably
 * GL_FALSE on the early-outs, GL_TRUE when the big comparison matches)
 * and the switch's default case are missing.  Return type is also not
 * visible here; verify against the complete file.
 */
1522 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1524 struct brw_compile
*p
= &c
->func
;
/* Look at the last instruction already stored in the program. */
1525 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
/* No previous instruction to inspect (prev_insn above would be
 * out of bounds) — bail out.
 */
1527 if (p
->nr_insn
== 0)
/* Indirect-addressed values can't be matched against the encoded
 * destination fields — bail out.
 */
1530 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
/* Only these opcodes are considered to have (implicitly) written the
 * accumulator here; match every destination-encoding field of the
 * previous instruction against `val`.
 */
1533 switch (prev_insn
->header
.opcode
) {
1534 case BRW_OPCODE_MOV
:
1535 case BRW_OPCODE_MAC
:
1536 case BRW_OPCODE_MUL
:
1537 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1538 prev_insn
->header
.execution_size
== val
.width
&&
1539 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1540 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1541 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1542 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
/* Align16 encodes subreg in units of 16 bytes, hence the /16. */
1543 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
/* Must have written all four channels (writemask 0xf). */
1544 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
/* get_predicate: map a Mesa IR instruction's condition-code destination
 * info (DstReg.CondMask / CondSwizzle) to a gen4 predicate-control value
 * for the instruction about to be emitted.
 *
 * NOTE(review): lossy extraction — the function braces, return type, and
 * the `case SWIZZLE_*:` labels in front of each REPLICATE return are
 * missing; verify against the complete file.
 */
1554 get_predicate(const struct prog_instruction
*inst
)
/* COND_TR means "always" — no predication needed. */
1556 if (inst
->DstReg
.CondMask
== COND_TR
)
1557 return BRW_PREDICATE_NONE
;
1559 /* All of GLSL only produces predicates for COND_NE and one channel per
1560 * vector. Fail badly if someone starts doing something else, as it might
1561 * mean infinite looping or something.
1563 * We'd like to support all the condition codes, but our hardware doesn't
1564 * quite match the Mesa IR, which is modeled after the NV extensions. For
1565 * those, the instruction may update the condition codes or not, then any
1566 * later instruction may use one of those condition codes. For gen4, the
1567 * instruction may update the flags register based on one of the condition
1568 * codes output by the instruction, and then further instructions may
1569 * predicate on that. We can probably support this, but it won't
1570 * necessarily be easy.
1572 assert(inst
->DstReg
.CondMask
== COND_NE
);
/* Pick the replicate-one-channel predicate matching the condition
 * swizzle (case labels missing from this extraction, presumably
 * SWIZZLE_XXXX/YYYY/ZZZZ/WWWW).
 */
1574 switch (inst
->DstReg
.CondSwizzle
) {
1576 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1578 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1580 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1582 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
/* Unrecognized swizzle: report and fall back to normal predication. */
1584 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1585 inst
->DstReg
.CondMask
);
1586 return BRW_PREDICATE_NORMAL
;
1590 /* Emit the vertex program instructions here.
1592 void brw_vs_emit(struct brw_vs_compile
*c
)
1594 #define MAX_IF_DEPTH 32
1595 #define MAX_LOOP_DEPTH 32
1596 struct brw_compile
*p
= &c
->func
;
1597 struct brw_context
*brw
= p
->brw
;
1598 struct intel_context
*intel
= &brw
->intel
;
1599 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1600 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1601 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1602 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1603 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1607 if (INTEL_DEBUG
& DEBUG_VS
) {
1608 printf("vs-mesa:\n");
1609 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1614 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1615 brw_set_access_mode(p
, BRW_ALIGN_16
);
1616 if_depth_in_loop
[loop_depth
] = 0;
1618 brw_set_acc_write_control(p
, 1);
1620 for (insn
= 0; insn
< nr_insns
; insn
++) {
1622 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1624 /* Message registers can't be read, so copy the output into GRF
1625 * register if they are used in source registers
1627 for (i
= 0; i
< 3; i
++) {
1628 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1629 GLuint index
= src
->Index
;
1630 GLuint file
= src
->File
;
1631 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1632 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1635 switch (inst
->Opcode
) {
1638 c
->needs_stack
= GL_TRUE
;
1645 /* Static register allocation
1647 brw_vs_alloc_regs(c
);
1650 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1652 for (insn
= 0; insn
< nr_insns
; insn
++) {
1654 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1655 struct brw_reg args
[3], dst
;
1659 printf("%d: ", insn
);
1660 _mesa_print_instruction(inst
);
1663 /* Get argument regs. SWZ is special and does this itself.
1665 if (inst
->Opcode
!= OPCODE_SWZ
)
1666 for (i
= 0; i
< 3; i
++) {
1667 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1670 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1671 args
[i
] = c
->output_regs
[index
].reg
;
1673 args
[i
] = get_arg(c
, inst
, i
);
1676 /* Get dest regs. Note that it is possible for a reg to be both
1677 * dst and arg, given the static allocation of registers. So
1678 * care needs to be taken emitting multi-operation instructions.
1680 index
= inst
->DstReg
.Index
;
1681 file
= inst
->DstReg
.File
;
1682 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1683 dst
= c
->output_regs
[index
].reg
;
1685 dst
= get_dst(c
, inst
->DstReg
);
1687 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1688 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1689 inst
->SaturateMode
);
1692 switch (inst
->Opcode
) {
1694 brw_MOV(p
, dst
, brw_abs(args
[0]));
1697 brw_ADD(p
, dst
, args
[0], args
[1]);
1700 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1703 brw_DP2(p
, dst
, args
[0], args
[1]);
1706 brw_DP3(p
, dst
, args
[0], args
[1]);
1709 brw_DP4(p
, dst
, args
[0], args
[1]);
1712 brw_DPH(p
, dst
, args
[0], args
[1]);
1715 emit_nrm(c
, dst
, args
[0], 3);
1718 emit_nrm(c
, dst
, args
[0], 4);
1721 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1724 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1727 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1730 brw_RNDD(p
, dst
, args
[0]);
1733 brw_RNDD(p
, dst
, args
[0]);
1736 brw_FRC(p
, dst
, args
[0]);
1739 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1742 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1745 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1748 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1751 if (!accumulator_contains(c
, args
[2]))
1752 brw_MOV(p
, brw_acc_reg(), args
[2]);
1753 brw_MAC(p
, dst
, args
[0], args
[1]);
1756 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
1759 emit_max(p
, dst
, args
[0], args
[1]);
1762 emit_min(p
, dst
, args
[0], args
[1]);
1765 brw_MOV(p
, dst
, args
[0]);
1768 brw_MUL(p
, dst
, args
[0], args
[1]);
1771 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1774 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1777 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1781 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1784 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1787 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1790 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1793 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1796 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1799 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1802 unalias1(c
, dst
, args
[0], emit_sign
);
1805 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1808 /* The args[0] value can't be used here as it won't have
1809 * correctly encoded the full swizzle:
1811 emit_swz(c
, dst
, inst
);
1814 /* round toward zero */
1815 brw_RNDZ(p
, dst
, args
[0]);
1818 emit_xpd(p
, dst
, args
[0], args
[1]);
1821 assert(if_depth
< MAX_IF_DEPTH
);
1822 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1823 /* Note that brw_IF smashes the predicate_control field. */
1824 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1825 if_depth_in_loop
[loop_depth
]++;
1829 assert(if_depth
> 0);
1830 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1833 assert(if_depth
> 0);
1834 brw_ENDIF(p
, if_inst
[--if_depth
]);
1835 if_depth_in_loop
[loop_depth
]--;
1837 case OPCODE_BGNLOOP
:
1838 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1839 if_depth_in_loop
[loop_depth
] = 0;
1842 brw_set_predicate_control(p
, get_predicate(inst
));
1843 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
1844 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1847 brw_set_predicate_control(p
, get_predicate(inst
));
1848 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
1849 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1851 case OPCODE_ENDLOOP
:
1853 struct brw_instruction
*inst0
, *inst1
;
1858 if (intel
->gen
== 5)
1861 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1862 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1863 while (inst0
> loop_inst
[loop_depth
]) {
1865 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
1866 inst0
->bits3
.if_else
.jump_count
== 0) {
1867 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1869 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
1870 inst0
->bits3
.if_else
.jump_count
== 0) {
1871 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1877 brw_set_predicate_control(p
, get_predicate(inst
));
1878 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1879 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1882 brw_set_access_mode(p
, BRW_ALIGN_1
);
1883 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1884 brw_set_access_mode(p
, BRW_ALIGN_16
);
1885 brw_ADD(p
, get_addr_reg(stack_index
),
1886 get_addr_reg(stack_index
), brw_imm_d(4));
1887 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1888 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1891 brw_ADD(p
, get_addr_reg(stack_index
),
1892 get_addr_reg(stack_index
), brw_imm_d(-4));
1893 brw_set_access_mode(p
, BRW_ALIGN_1
);
1894 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1895 brw_set_access_mode(p
, BRW_ALIGN_16
);
1898 emit_vertex_write(c
);
1904 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1910 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1911 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1912 _mesa_opcode_string(inst
->Opcode
) :
1916 /* Set the predication update on the last instruction of the native
1917 * instruction sequence.
1919 * This would be problematic if it was set on a math instruction,
1920 * but that shouldn't be the case with the current GLSL compiler.
1922 if (inst
->CondUpdate
) {
1923 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
1925 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
1926 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
1929 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1930 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1931 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1932 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1935 /* Result color clamping.
1937 * When destination register is an output register and
1938 * it's primary/secondary front/back color, we have to clamp
1939 * the result to [0,1]. This is done by enabling the
1940 * saturation bit for the last instruction.
1942 * We don't use brw_set_saturate() as it modifies
1943 * p->current->header.saturate, which affects all the subsequent
1944 * instructions. Instead, we directly modify the header
1945 * of the last (already stored) instruction.
1947 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1948 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1949 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1950 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1951 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1952 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1956 if (inst
->DstReg
.RelAddr
) {
1957 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
1958 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
1959 move_to_reladdr_dst(c
, inst
, dst
);
1965 brw_resolve_cals(p
);
1969 if (INTEL_DEBUG
& DEBUG_VS
) {
1972 printf("vs-native:\n");
1973 for (i
= 0; i
< p
->nr_insn
; i
++)
1974 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);