2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
124 clear_current_const(struct brw_vs_compile
*c
)
128 if (c
->vp
->use_const_buffer
) {
129 for (i
= 0; i
< 3; i
++) {
130 c
->current_const
[i
].index
= -1;
136 * Preallocate GRF register before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
140 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
142 struct intel_context
*intel
= &c
->func
.brw
->intel
;
143 GLuint i
, reg
= 0, mrf
;
144 int attributes_in_vue
;
145 int first_reladdr_output
;
147 /* Determine whether to use a real constant buffer or use a block
148 * of GRF registers for constants. The later is faster but only
149 * works if everything fits in the GRF.
150 * XXX this heuristic/check may need some fine tuning...
152 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
153 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
154 c
->vp
->use_const_buffer
= GL_TRUE
;
156 c
->vp
->use_const_buffer
= GL_FALSE
;
158 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
160 /* r0 -- reserved as usual
162 c
->r0
= brw_vec8_grf(reg
, 0);
165 /* User clip planes from curbe:
167 if (c
->key
.nr_userclip
) {
168 if (intel
->gen
>= 6) {
169 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
170 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
171 (i
% 2) * 4), 0, 4, 1);
173 reg
+= ALIGN(c
->key
.nr_userclip
, 2) / 2;
175 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
176 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ (6 + i
) / 2,
177 (i
% 2) * 4), 0, 4, 1);
179 reg
+= (ALIGN(6 + c
->key
.nr_userclip
, 4) / 4) * 2;
184 /* Vertex program parameters from curbe:
186 if (c
->vp
->use_const_buffer
) {
187 int max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
190 /* We've got more constants than we can load with the push
191 * mechanism. This is often correlated with reladdr loads where
192 * we should probably be using a pull mechanism anyway to avoid
193 * excessive reading. However, the pull mechanism is slow in
194 * general. So, we try to allocate as many non-reladdr-loaded
195 * constants through the push buffer as we can before giving up.
197 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
199 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
201 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
204 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
205 if ((inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
206 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
207 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
208 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
209 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) ||
210 inst
->SrcReg
[arg
].RelAddr
)
213 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
214 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
219 for (i
= 0; i
< constant
; i
++) {
220 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2,
224 reg
+= (constant
+ 1) / 2;
225 c
->prog_data
.curb_read_length
= reg
- 1;
226 /* XXX 0 causes a bug elsewhere... */
227 c
->prog_data
.nr_params
= MAX2(constant
* 4, 4);
230 /* use a section of the GRF for constants */
231 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
232 for (i
= 0; i
< nr_params
; i
++) {
233 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
235 reg
+= (nr_params
+ 1) / 2;
236 c
->prog_data
.curb_read_length
= reg
- 1;
238 c
->prog_data
.nr_params
= nr_params
* 4;
241 /* Allocate input regs:
244 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
245 if (c
->prog_data
.inputs_read
& (1 << i
)) {
247 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
251 /* If there are no inputs, we'll still be reading one attribute's worth
252 * because it's required -- see urb_read_length setting.
254 if (c
->nr_inputs
== 0)
257 /* Allocate outputs. The non-position outputs go straight into message regs.
260 c
->first_output
= reg
;
261 c
->first_overflow_output
= 0;
263 if (intel
->gen
>= 6) {
265 if (c
->key
.nr_userclip
)
267 } else if (intel
->gen
== 5)
272 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
273 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
274 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
276 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
277 if (i
== VERT_RESULT_HPOS
) {
278 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
281 else if (i
== VERT_RESULT_PSIZ
) {
282 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
284 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
287 /* Two restrictions on our compute-to-MRF here. The
288 * message length for all SEND messages is restricted to
289 * [1,15], so we can't use mrf 15, as that means a length
292 * Additionally, URB writes are aligned to URB rows, so we
293 * need to put an even number of registers of URB data in
294 * each URB write so that the later write is aligned. A
295 * message length of 15 means 1 message header reg plus 14
298 * For attributes beyond the compute-to-MRF, we compute to
299 * GRFs and they will be written in the second URB_WRITE.
301 if (first_reladdr_output
> i
&& mrf
< 15) {
302 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
306 if (mrf
>= 15 && !c
->first_overflow_output
)
307 c
->first_overflow_output
= i
;
308 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
316 /* Allocate program temporaries:
318 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
319 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
323 /* Address reg(s). Don't try to use the internal address reg until
326 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
327 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
331 BRW_VERTICAL_STRIDE_8
,
333 BRW_HORIZONTAL_STRIDE_1
,
339 if (c
->vp
->use_const_buffer
) {
340 for (i
= 0; i
< 3; i
++) {
341 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
344 clear_current_const(c
);
347 for (i
= 0; i
< 128; i
++) {
348 if (c
->output_regs
[i
].used_in_src
) {
349 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
354 if (c
->needs_stack
) {
355 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
359 /* Some opcodes need an internal temporary:
362 c
->last_tmp
= reg
; /* for allocation purposes */
364 /* Each input reg holds data from two vertices. The
365 * urb_read_length is the number of registers read from *each*
366 * vertex urb, so is half the amount:
368 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
369 /* Setting this field to 0 leads to undefined behavior according to the
370 * the VS_STATE docs. Our VUEs will always have at least one attribute
371 * sitting in them, even if it's padding.
373 if (c
->prog_data
.urb_read_length
== 0)
374 c
->prog_data
.urb_read_length
= 1;
376 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
377 * them to fit the biggest thing they need to.
379 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
381 /* See emit_vertex_write() for where the VUE's overhead on top of the
382 * attributes comes from.
384 if (intel
->gen
>= 6) {
386 if (c
->key
.nr_userclip
)
389 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 7) / 8;
390 } else if (intel
->gen
== 5)
391 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
393 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
395 c
->prog_data
.total_grf
= reg
;
397 if (INTEL_DEBUG
& DEBUG_VS
) {
398 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
399 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
400 printf("%s reg = %d\n", __FUNCTION__
, reg
);
406 * If an instruction uses a temp reg both as a src and the dest, we
407 * sometimes need to allocate an intermediate temporary.
409 static void unalias1( struct brw_vs_compile
*c
,
412 void (*func
)( struct brw_vs_compile
*,
416 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
417 struct brw_compile
*p
= &c
->func
;
418 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
420 brw_MOV(p
, dst
, tmp
);
430  * Checks if a 2-operand instruction needs an intermediate temporary.
432 static void unalias2( struct brw_vs_compile
*c
,
436 void (*func
)( struct brw_vs_compile
*,
441 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
442 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
443 struct brw_compile
*p
= &c
->func
;
444 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
445 func(c
, tmp
, arg0
, arg1
);
446 brw_MOV(p
, dst
, tmp
);
450 func(c
, dst
, arg0
, arg1
);
456  * Checks if a 3-operand instruction needs an intermediate temporary.
458 static void unalias3( struct brw_vs_compile
*c
,
463 void (*func
)( struct brw_vs_compile
*,
469 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
470 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
471 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
472 struct brw_compile
*p
= &c
->func
;
473 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
474 func(c
, tmp
, arg0
, arg1
, arg2
);
475 brw_MOV(p
, dst
, tmp
);
479 func(c
, dst
, arg0
, arg1
, arg2
);
483 static void emit_sop( struct brw_vs_compile
*c
,
489 struct brw_compile
*p
= &c
->func
;
491 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
492 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
493 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
494 brw_set_predicate_control_flag_value(p
, 0xff);
497 static void emit_seq( struct brw_vs_compile
*c
,
500 struct brw_reg arg1
)
502 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
505 static void emit_sne( struct brw_vs_compile
*c
,
508 struct brw_reg arg1
)
510 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
512 static void emit_slt( struct brw_vs_compile
*c
,
515 struct brw_reg arg1
)
517 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
520 static void emit_sle( struct brw_vs_compile
*c
,
523 struct brw_reg arg1
)
525 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
528 static void emit_sgt( struct brw_vs_compile
*c
,
531 struct brw_reg arg1
)
533 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
536 static void emit_sge( struct brw_vs_compile
*c
,
539 struct brw_reg arg1
)
541 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
544 static void emit_cmp( struct brw_compile
*p
,
548 struct brw_reg arg2
)
550 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
551 brw_SEL(p
, dst
, arg1
, arg2
);
552 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
555 static void emit_sign(struct brw_vs_compile
*c
,
559 struct brw_compile
*p
= &c
->func
;
561 brw_MOV(p
, dst
, brw_imm_f(0));
563 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
564 brw_MOV(p
, dst
, brw_imm_f(-1.0));
565 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
567 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
568 brw_MOV(p
, dst
, brw_imm_f(1.0));
569 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
572 static void emit_max( struct brw_compile
*p
,
575 struct brw_reg arg1
)
577 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
578 brw_SEL(p
, dst
, arg0
, arg1
);
579 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
582 static void emit_min( struct brw_compile
*p
,
585 struct brw_reg arg1
)
587 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
588 brw_SEL(p
, dst
, arg0
, arg1
);
589 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
593 static void emit_math1( struct brw_vs_compile
*c
,
599 /* There are various odd behaviours with SEND on the simulator. In
600 * addition there are documented issues with the fact that the GEN4
601 * processor doesn't do dependency control properly on SEND
602 * results. So, on balance, this kludge to get around failures
603 * with writemasked math results looks like it might be necessary
604 * whether that turns out to be a simulator bug or not:
606 struct brw_compile
*p
= &c
->func
;
607 struct intel_context
*intel
= &p
->brw
->intel
;
608 struct brw_reg tmp
= dst
;
609 GLboolean need_tmp
= GL_FALSE
;
611 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
)
614 if (intel
->gen
< 6 && dst
.dw1
.bits
.writemask
!= 0xf)
623 BRW_MATH_SATURATE_NONE
,
626 BRW_MATH_DATA_SCALAR
,
630 brw_MOV(p
, dst
, tmp
);
636 static void emit_math2( struct brw_vs_compile
*c
,
643 struct brw_compile
*p
= &c
->func
;
644 struct intel_context
*intel
= &p
->brw
->intel
;
645 struct brw_reg tmp
= dst
;
646 GLboolean need_tmp
= GL_FALSE
;
648 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
)
651 if (intel
->gen
< 6 && dst
.dw1
.bits
.writemask
!= 0xf)
657 brw_MOV(p
, brw_message_reg(3), arg1
);
662 BRW_MATH_SATURATE_NONE
,
665 BRW_MATH_DATA_SCALAR
,
669 brw_MOV(p
, dst
, tmp
);
675 static void emit_exp_noalias( struct brw_vs_compile
*c
,
677 struct brw_reg arg0
)
679 struct brw_compile
*p
= &c
->func
;
682 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
683 struct brw_reg tmp
= get_tmp(c
);
684 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
686 /* tmp_d = floor(arg0.x) */
687 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
689 /* result[0] = 2.0 ^ tmp */
691 /* Adjust exponent for floating point:
694 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
696 /* Install exponent and sign.
697 * Excess drops off the edge:
699 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
700 tmp_d
, brw_imm_d(23));
705 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
706 /* result[1] = arg0.x - floor(arg0.x) */
707 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
710 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
711 /* As with the LOG instruction, we might be better off just
712 * doing a taylor expansion here, seeing as we have to do all
715 * If mathbox partial precision is too low, consider also:
716 * result[3] = result[0] * EXP(result[1])
719 BRW_MATH_FUNCTION_EXP
,
720 brw_writemask(dst
, WRITEMASK_Z
),
721 brw_swizzle1(arg0
, 0),
722 BRW_MATH_PRECISION_FULL
);
725 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
726 /* result[3] = 1.0; */
727 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
732 static void emit_log_noalias( struct brw_vs_compile
*c
,
734 struct brw_reg arg0
)
736 struct brw_compile
*p
= &c
->func
;
737 struct brw_reg tmp
= dst
;
738 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
739 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
740 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
741 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
745 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
748 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
751 * These almost look likey they could be joined up, but not really
754 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
755 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
757 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
759 brw_writemask(tmp_ud
, WRITEMASK_X
),
760 brw_swizzle1(arg0_ud
, 0),
761 brw_imm_ud((1U<<31)-1));
764 brw_writemask(tmp_ud
, WRITEMASK_X
),
769 brw_writemask(tmp
, WRITEMASK_X
),
770 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
774 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
776 brw_writemask(tmp_ud
, WRITEMASK_Y
),
777 brw_swizzle1(arg0_ud
, 0),
778 brw_imm_ud((1<<23)-1));
781 brw_writemask(tmp_ud
, WRITEMASK_Y
),
783 brw_imm_ud(127<<23));
786 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
787 /* result[2] = result[0] + LOG2(result[1]); */
789 /* Why bother? The above is just a hint how to do this with a
790 * taylor series. Maybe we *should* use a taylor series as by
791 * the time all the above has been done it's almost certainly
792 * quicker than calling the mathbox, even with low precision.
795 * - result[0] + mathbox.LOG2(result[1])
796 * - mathbox.LOG2(arg0.x)
797 * - result[0] + inline_taylor_approx(result[1])
800 BRW_MATH_FUNCTION_LOG
,
801 brw_writemask(tmp
, WRITEMASK_Z
),
802 brw_swizzle1(tmp
, 1),
803 BRW_MATH_PRECISION_FULL
);
806 brw_writemask(tmp
, WRITEMASK_Z
),
807 brw_swizzle1(tmp
, 2),
808 brw_swizzle1(tmp
, 0));
811 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
812 /* result[3] = 1.0; */
813 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
817 brw_MOV(p
, dst
, tmp
);
823 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
825 static void emit_dst_noalias( struct brw_vs_compile
*c
,
830 struct brw_compile
*p
= &c
->func
;
832 /* There must be a better way to do this:
834 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
835 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
836 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
837 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
838 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
839 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
840 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
841 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
845 static void emit_xpd( struct brw_compile
*p
,
850 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
851 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
855 static void emit_lit_noalias( struct brw_vs_compile
*c
,
857 struct brw_reg arg0
)
859 struct brw_compile
*p
= &c
->func
;
860 struct brw_instruction
*if_insn
;
861 struct brw_reg tmp
= dst
;
862 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
867 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
868 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
870 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
871 * to get all channels active inside the IF. In the clipping code
872 * we run with NoMask, so it's not an option and we can use
873 * BRW_EXECUTE_1 for all comparisions.
875 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
876 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
878 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
880 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
881 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
882 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
885 BRW_MATH_FUNCTION_POW
,
886 brw_writemask(dst
, WRITEMASK_Z
),
887 brw_swizzle1(tmp
, 2),
888 brw_swizzle1(arg0
, 3),
889 BRW_MATH_PRECISION_PARTIAL
);
892 brw_ENDIF(p
, if_insn
);
897 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
903 struct brw_compile
*p
= &c
->func
;
905 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
906 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
907 brw_MAC(p
, dst
, arg0
, arg1
);
910 /** 3 or 4-component vector normalization */
911 static void emit_nrm( struct brw_vs_compile
*c
,
916 struct brw_compile
*p
= &c
->func
;
917 struct brw_reg tmp
= get_tmp(c
);
919 /* tmp = dot(arg0, arg0) */
921 brw_DP3(p
, tmp
, arg0
, arg0
);
923 brw_DP4(p
, tmp
, arg0
, arg0
);
925 /* tmp = 1 / sqrt(tmp) */
926 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
928 /* dst = arg0 * tmp */
929 brw_MUL(p
, dst
, arg0
, tmp
);
935 static struct brw_reg
936 get_constant(struct brw_vs_compile
*c
,
937 const struct prog_instruction
*inst
,
940 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
941 struct brw_compile
*p
= &c
->func
;
942 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
944 assert(argIndex
< 3);
946 assert(c
->func
.brw
->intel
.gen
< 6); /* FINISHME */
948 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
949 /* Keep track of the last constant loaded in this slot, for reuse. */
950 c
->current_const
[argIndex
].index
= src
->Index
;
953 printf(" fetch const[%d] for arg %d into reg %d\n",
954 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
956 /* need to fetch the constant now */
958 const_reg
, /* writeback dest */
959 16 * src
->Index
, /* byte offset */
960 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
964 /* replicate lower four floats into upper half (to get XYZWXYZW) */
965 const_reg
= stride(const_reg
, 0, 4, 0);
971 static struct brw_reg
972 get_reladdr_constant(struct brw_vs_compile
*c
,
973 const struct prog_instruction
*inst
,
976 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
977 struct brw_compile
*p
= &c
->func
;
978 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
979 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
980 struct brw_reg byte_addr_reg
= retype(get_tmp(c
), BRW_REGISTER_TYPE_D
);
982 assert(argIndex
< 3);
984 assert(c
->func
.brw
->intel
.gen
< 6); /* FINISHME */
986 /* Can't reuse a reladdr constant load. */
987 c
->current_const
[argIndex
].index
= -1;
990 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
991 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
994 brw_MUL(p
, byte_addr_reg
, addrReg
, brw_imm_ud(16));
996 /* fetch the first vec4 */
997 brw_dp_READ_4_vs_relative(p
,
998 const_reg
, /* writeback dest */
999 byte_addr_reg
, /* address register */
1000 16 * src
->Index
, /* byte offset */
1001 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
1009 /* TODO: relative addressing!
1011 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
1012 gl_register_file file
,
1016 case PROGRAM_TEMPORARY
:
1018 case PROGRAM_OUTPUT
:
1019 assert(c
->regs
[file
][index
].nr
!= 0);
1020 return c
->regs
[file
][index
];
1021 case PROGRAM_STATE_VAR
:
1022 case PROGRAM_CONSTANT
:
1023 case PROGRAM_UNIFORM
:
1024 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1025 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1026 case PROGRAM_ADDRESS
:
1028 return c
->regs
[file
][index
];
1030 case PROGRAM_UNDEFINED
: /* undef values */
1031 return brw_null_reg();
1033 case PROGRAM_LOCAL_PARAM
:
1034 case PROGRAM_ENV_PARAM
:
1035 case PROGRAM_WRITE_ONLY
:
1038 return brw_null_reg();
1044 * Indirect addressing: get reg[[arg] + offset].
1046 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1051 struct brw_compile
*p
= &c
->func
;
1052 struct brw_reg tmp
= get_tmp(c
);
1053 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1054 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1055 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1056 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1057 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1059 /* Set the vertical stride on the register access so that the first
1060 * 4 components come from a0.0 and the second 4 from a0.1.
1062 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1065 brw_push_insn_state(p
);
1066 brw_set_access_mode(p
, BRW_ALIGN_1
);
1068 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1069 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1071 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1072 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1074 brw_MOV(p
, tmp
, indirect
);
1076 brw_pop_insn_state(p
);
1079 /* NOTE: tmp not released */
1084 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1085 const struct prog_instruction
*inst
,
1088 struct brw_compile
*p
= &c
->func
;
1090 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1091 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1092 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1093 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1094 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1095 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1097 /* Because destination register indirect addressing can only use
1098 * one index, we'll write each vertex's vec4 value separately.
1100 val
.width
= BRW_WIDTH_4
;
1101 val
.vstride
= BRW_VERTICAL_STRIDE_4
;
1103 brw_push_insn_state(p
);
1104 brw_set_access_mode(p
, BRW_ALIGN_1
);
1106 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1107 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1108 brw_MOV(p
, indirect
, val
);
1110 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1111 brw_ADD(p
, brw_address_reg(0), acc
,
1112 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1113 brw_MOV(p
, indirect
, suboffset(val
, 4));
1115 brw_pop_insn_state(p
);
1119 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1120 * TODO: relative addressing!
1122 static struct brw_reg
1123 get_src_reg( struct brw_vs_compile
*c
,
1124 const struct prog_instruction
*inst
,
1127 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1128 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1129 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1131 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1132 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1134 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1138 return brw_imm_f(0.0f
);
1139 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1144 return brw_imm_f(-1.0F
);
1146 return brw_imm_f(1.0F
);
1147 } else if (src
->File
== PROGRAM_CONSTANT
) {
1148 const struct gl_program_parameter_list
*params
;
1152 switch (src
->Swizzle
) {
1167 if (component
>= 0) {
1168 params
= c
->vp
->program
.Base
.Parameters
;
1169 f
= params
->ParameterValues
[src
->Index
][component
];
1175 return brw_imm_f(f
);
1181 case PROGRAM_TEMPORARY
:
1183 case PROGRAM_OUTPUT
:
1185 return deref(c
, c
->regs
[file
][0], index
, 32);
1188 assert(c
->regs
[file
][index
].nr
!= 0);
1189 return c
->regs
[file
][index
];
1192 case PROGRAM_STATE_VAR
:
1193 case PROGRAM_CONSTANT
:
1194 case PROGRAM_UNIFORM
:
1195 case PROGRAM_ENV_PARAM
:
1196 case PROGRAM_LOCAL_PARAM
:
1197 if (c
->vp
->use_const_buffer
) {
1198 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1199 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1200 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1202 return get_reladdr_constant(c
, inst
, argIndex
);
1204 return get_constant(c
, inst
, argIndex
);
1207 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
, 16);
1210 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1211 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1213 case PROGRAM_ADDRESS
:
1215 return c
->regs
[file
][index
];
1217 case PROGRAM_UNDEFINED
:
1218 /* this is a normal case since we loop over all three src args */
1219 return brw_null_reg();
1221 case PROGRAM_WRITE_ONLY
:
1224 return brw_null_reg();
1229 * Return the brw reg for the given instruction's src argument.
1230 * Will return mangled results for SWZ op. The emit_swz() function
1231 * ignores this result and recalculates taking extended swizzles into
1234 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1235 const struct prog_instruction
*inst
,
1238 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1241 if (src
->File
== PROGRAM_UNDEFINED
)
1242 return brw_null_reg();
1244 reg
= get_src_reg(c
, inst
, argIndex
);
1246 /* Convert 3-bit swizzle to 2-bit.
1248 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1249 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1250 GET_SWZ(src
->Swizzle
, 1),
1251 GET_SWZ(src
->Swizzle
, 2),
1252 GET_SWZ(src
->Swizzle
, 3));
1255 /* Note this is ok for non-swizzle instructions:
1257 reg
.negate
= src
->Negate
? 1 : 0;
1264 * Get brw register for the given program dest register.
1266 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1267 struct prog_dst_register dst
)
1272 case PROGRAM_TEMPORARY
:
1273 case PROGRAM_OUTPUT
:
1274 /* register-indirect addressing is only 1x1, not VxH, for
1275 * destination regs. So, for RelAddr we'll return a temporary
1276 * for the dest and do a move of the result to the RelAddr
1277 * register after the instruction emit.
1282 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1283 reg
= c
->regs
[dst
.File
][dst
.Index
];
1286 case PROGRAM_ADDRESS
:
1287 assert(dst
.Index
== 0);
1288 reg
= c
->regs
[dst
.File
][dst
.Index
];
1290 case PROGRAM_UNDEFINED
:
1291 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1292 reg
= brw_null_reg();
1296 reg
= brw_null_reg();
1299 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1300 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1306 static void emit_swz( struct brw_vs_compile
*c
,
1308 const struct prog_instruction
*inst
)
1310 const GLuint argIndex
= 0;
1311 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1312 struct brw_compile
*p
= &c
->func
;
1313 GLuint zeros_mask
= 0;
1314 GLuint ones_mask
= 0;
1315 GLuint src_mask
= 0;
1317 GLboolean need_tmp
= (src
.Negate
&&
1318 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1319 struct brw_reg tmp
= dst
;
1325 for (i
= 0; i
< 4; i
++) {
1326 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1327 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1346 /* Do src first, in case dst aliases src:
1349 struct brw_reg arg0
;
1351 arg0
= get_src_reg(c
, inst
, argIndex
);
1353 arg0
= brw_swizzle(arg0
,
1354 src_swz
[0], src_swz
[1],
1355 src_swz
[2], src_swz
[3]);
1357 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1361 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1364 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1367 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1370 brw_MOV(p
, dst
, tmp
);
1371 release_tmp(c
, tmp
);
1377 * Post-vertex-program processing. Send the results to the URB.
1379 static void emit_vertex_write( struct brw_vs_compile
*c
)
1381 struct brw_compile
*p
= &c
->func
;
1382 struct brw_context
*brw
= p
->brw
;
1383 struct intel_context
*intel
= &brw
->intel
;
1384 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1387 GLuint len_vertex_header
= 2;
1390 if (c
->key
.copy_edgeflag
) {
1392 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1393 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1396 if (intel
->gen
< 6) {
1397 /* Build ndc coords */
1399 /* ndc = 1.0 / pos.w */
1400 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1401 /* ndc.xyz = pos * ndc */
1402 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1405 /* Update the header for point size, user clipping flags, and -ve rhw
1408 if (intel
->gen
>= 6) {
1409 struct brw_reg m1
= brw_message_reg(1);
1411 /* On gen6, m1 has each value in a separate dword, so we never
1412 * need to mess with a temporary for computing the m1 value.
1414 brw_MOV(p
, retype(m1
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1415 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1416 brw_MOV(p
, brw_writemask(m1
, WRITEMASK_W
),
1417 brw_swizzle1(c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
], 0));
1420 /* Set the user clip distances in dword 8-15. (m3-4)*/
1421 if (c
->key
.nr_userclip
) {
1422 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1425 m
= brw_message_reg(3);
1427 m
= brw_message_reg(4);
1429 brw_DP4(p
, brw_writemask(m
, (1 << (i
& 7))),pos
, c
->userplane
[i
]);
1432 } else if ((c
->prog_data
.outputs_written
&
1433 BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1434 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
) {
1435 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1438 brw_MOV(p
, header1
, brw_imm_ud(0));
1440 brw_set_access_mode(p
, BRW_ALIGN_16
);
1442 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1443 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1444 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
),
1445 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1446 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
),
1447 header1
, brw_imm_ud(0x7ff<<8));
1450 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1451 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1452 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1453 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1454 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1457 /* i965 clipping workaround:
1458 * 1) Test for -ve rhw
1460 * set ndc = (0,0,0,0)
1463 * Later, clipping will detect ucp[6] and ensure the primitive is
1464 * clipped against all fixed planes.
1466 if (brw
->has_negative_rhw_bug
) {
1468 vec8(brw_null_reg()),
1470 brw_swizzle1(ndc
, 3),
1473 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1474 brw_MOV(p
, ndc
, brw_imm_f(0));
1475 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1478 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1479 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1480 brw_set_access_mode(p
, BRW_ALIGN_16
);
1482 release_tmp(c
, header1
);
1485 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1488 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1489 * of zeros followed by two sets of NDC coordinates:
1491 brw_set_access_mode(p
, BRW_ALIGN_1
);
1492 brw_set_acc_write_control(p
, 0);
1494 /* The VUE layout is documented in Volume 2a. */
1495 if (intel
->gen
>= 6) {
1496 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1497 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1498 * dword 4-7 (m2) is the 4D space position
1499 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1501 * m3 or 5 is the first vertex element data we fill, which is
1502 * the vertex position.
1504 brw_MOV(p
, brw_message_reg(2), pos
);
1505 len_vertex_header
= 1;
1506 if (c
->key
.nr_userclip
> 0)
1507 len_vertex_header
+= 2;
1508 } else if (intel
->gen
== 5) {
1509 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1510 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1511 * dword 4-7 (m2) is the ndc position (set above)
1512 * dword 8-11 (m3) of the vertex header is the 4D space position
1513 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1514 * m6 is a pad so that the vertex element data is aligned
1515 * m7 is the first vertex data we fill, which is the vertex position.
1517 brw_MOV(p
, brw_message_reg(2), ndc
);
1518 brw_MOV(p
, brw_message_reg(3), pos
);
1519 brw_MOV(p
, brw_message_reg(7), pos
);
1520 len_vertex_header
= 6;
1522 /* There are 8 dwords in VUE header pre-Ironlake:
1523 * dword 0-3 (m1) is indices, point width, clip flags.
1524 * dword 4-7 (m2) is ndc position (set above)
1526 * dword 8-11 (m3) is the first vertex data, which we always have be the
1529 brw_MOV(p
, brw_message_reg(2), ndc
);
1530 brw_MOV(p
, brw_message_reg(3), pos
);
1531 len_vertex_header
= 2;
1534 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1535 next_mrf
= 2 + len_vertex_header
;
1536 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1537 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1539 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
1542 if (i
>= VERT_RESULT_TEX0
&&
1543 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1544 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1546 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1547 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
1551 eot
= (c
->first_overflow_output
== 0);
1554 brw_null_reg(), /* dest */
1555 0, /* starting mrf reg nr */
1559 MIN2(c
->nr_outputs
+ 1 + len_vertex_header
, (BRW_MAX_MRF
-1)), /* msg len */
1560 0, /* response len */
1562 eot
, /* writes complete */
1563 0, /* urb destination offset */
1564 BRW_URB_SWIZZLE_INTERLEAVE
);
1566 if (c
->first_overflow_output
> 0) {
1567 /* Not all of the vertex outputs/results fit into the MRF.
1568 * Move the overflowed attributes from the GRF to the MRF and
1569 * issue another brw_urb_WRITE().
1572 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1573 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1574 /* move from GRF to MRF */
1575 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1581 brw_null_reg(), /* dest */
1582 0, /* starting mrf reg nr */
1587 0, /* response len */
1589 1, /* writes complete */
1590 14 / 2, /* urb destination offset */
1591 BRW_URB_SWIZZLE_INTERLEAVE
);
1596 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1598 struct brw_compile
*p
= &c
->func
;
1599 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1601 if (p
->nr_insn
== 0)
1604 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1607 switch (prev_insn
->header
.opcode
) {
1608 case BRW_OPCODE_MOV
:
1609 case BRW_OPCODE_MAC
:
1610 case BRW_OPCODE_MUL
:
1611 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1612 prev_insn
->header
.execution_size
== val
.width
&&
1613 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1614 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1615 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1616 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1617 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1618 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1628 get_predicate(const struct prog_instruction
*inst
)
1630 if (inst
->DstReg
.CondMask
== COND_TR
)
1631 return BRW_PREDICATE_NONE
;
1633 /* All of GLSL only produces predicates for COND_NE and one channel per
1634 * vector. Fail badly if someone starts doing something else, as it might
1635 * mean infinite looping or something.
1637 * We'd like to support all the condition codes, but our hardware doesn't
1638 * quite match the Mesa IR, which is modeled after the NV extensions. For
1639 * those, the instruction may update the condition codes or not, then any
1640 * later instruction may use one of those condition codes. For gen4, the
1641 * instruction may update the flags register based on one of the condition
1642 * codes output by the instruction, and then further instructions may
1643 * predicate on that. We can probably support this, but it won't
1644 * necessarily be easy.
1646 assert(inst
->DstReg
.CondMask
== COND_NE
);
1648 switch (inst
->DstReg
.CondSwizzle
) {
1650 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1652 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1654 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1656 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1658 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1659 inst
->DstReg
.CondMask
);
1660 return BRW_PREDICATE_NORMAL
;
1664 /* Emit the vertex program instructions here.
1666 void brw_vs_emit(struct brw_vs_compile
*c
)
1668 #define MAX_IF_DEPTH 32
1669 #define MAX_LOOP_DEPTH 32
1670 struct brw_compile
*p
= &c
->func
;
1671 struct brw_context
*brw
= p
->brw
;
1672 struct intel_context
*intel
= &brw
->intel
;
1673 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1674 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1675 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1676 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1677 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1681 if (INTEL_DEBUG
& DEBUG_VS
) {
1682 printf("vs-mesa:\n");
1683 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1688 /* FIXME Need to fix conditional instruction to remove this */
1689 if (intel
->gen
>= 6)
1690 p
->single_program_flow
= GL_TRUE
;
1692 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1693 brw_set_access_mode(p
, BRW_ALIGN_16
);
1694 if_depth_in_loop
[loop_depth
] = 0;
1696 brw_set_acc_write_control(p
, 1);
1698 for (insn
= 0; insn
< nr_insns
; insn
++) {
1700 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1702 /* Message registers can't be read, so copy the output into GRF
1703 * register if they are used in source registers
1705 for (i
= 0; i
< 3; i
++) {
1706 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1707 GLuint index
= src
->Index
;
1708 GLuint file
= src
->File
;
1709 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1710 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1713 switch (inst
->Opcode
) {
1716 c
->needs_stack
= GL_TRUE
;
1723 /* Static register allocation
1725 brw_vs_alloc_regs(c
);
1728 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1730 for (insn
= 0; insn
< nr_insns
; insn
++) {
1732 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1733 struct brw_reg args
[3], dst
;
1737 printf("%d: ", insn
);
1738 _mesa_print_instruction(inst
);
1741 /* Get argument regs. SWZ is special and does this itself.
1743 if (inst
->Opcode
!= OPCODE_SWZ
)
1744 for (i
= 0; i
< 3; i
++) {
1745 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1748 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1749 args
[i
] = c
->output_regs
[index
].reg
;
1751 args
[i
] = get_arg(c
, inst
, i
);
1754 /* Get dest regs. Note that it is possible for a reg to be both
1755 * dst and arg, given the static allocation of registers. So
1756 * care needs to be taken emitting multi-operation instructions.
1758 index
= inst
->DstReg
.Index
;
1759 file
= inst
->DstReg
.File
;
1760 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1761 dst
= c
->output_regs
[index
].reg
;
1763 dst
= get_dst(c
, inst
->DstReg
);
1765 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1766 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1767 inst
->SaturateMode
);
1770 switch (inst
->Opcode
) {
1772 brw_MOV(p
, dst
, brw_abs(args
[0]));
1775 brw_ADD(p
, dst
, args
[0], args
[1]);
1778 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1781 brw_DP2(p
, dst
, args
[0], args
[1]);
1784 brw_DP3(p
, dst
, args
[0], args
[1]);
1787 brw_DP4(p
, dst
, args
[0], args
[1]);
1790 brw_DPH(p
, dst
, args
[0], args
[1]);
1793 emit_nrm(c
, dst
, args
[0], 3);
1796 emit_nrm(c
, dst
, args
[0], 4);
1799 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1802 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1805 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1808 brw_RNDD(p
, dst
, args
[0]);
1811 brw_RNDD(p
, dst
, args
[0]);
1814 brw_FRC(p
, dst
, args
[0]);
1817 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1820 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1823 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1826 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1829 if (!accumulator_contains(c
, args
[2]))
1830 brw_MOV(p
, brw_acc_reg(), args
[2]);
1831 brw_MAC(p
, dst
, args
[0], args
[1]);
1834 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
1837 emit_max(p
, dst
, args
[0], args
[1]);
1840 emit_min(p
, dst
, args
[0], args
[1]);
1843 brw_MOV(p
, dst
, args
[0]);
1846 brw_MUL(p
, dst
, args
[0], args
[1]);
1849 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1852 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1855 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1859 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1862 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1865 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1868 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1871 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1874 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1877 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1880 unalias1(c
, dst
, args
[0], emit_sign
);
1883 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1886 /* The args[0] value can't be used here as it won't have
1887 * correctly encoded the full swizzle:
1889 emit_swz(c
, dst
, inst
);
1892 /* round toward zero */
1893 brw_RNDZ(p
, dst
, args
[0]);
1896 emit_xpd(p
, dst
, args
[0], args
[1]);
1899 assert(if_depth
< MAX_IF_DEPTH
);
1900 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1901 /* Note that brw_IF smashes the predicate_control field. */
1902 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1903 if_depth_in_loop
[loop_depth
]++;
1907 clear_current_const(c
);
1908 assert(if_depth
> 0);
1909 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1912 clear_current_const(c
);
1913 assert(if_depth
> 0);
1914 brw_ENDIF(p
, if_inst
[--if_depth
]);
1915 if_depth_in_loop
[loop_depth
]--;
1917 case OPCODE_BGNLOOP
:
1918 clear_current_const(c
);
1919 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1920 if_depth_in_loop
[loop_depth
] = 0;
1923 brw_set_predicate_control(p
, get_predicate(inst
));
1924 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
1925 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1928 brw_set_predicate_control(p
, get_predicate(inst
));
1929 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
1930 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1932 case OPCODE_ENDLOOP
:
1934 clear_current_const(c
);
1935 struct brw_instruction
*inst0
, *inst1
;
1940 if (intel
->gen
== 5)
1943 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1944 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1945 while (inst0
> loop_inst
[loop_depth
]) {
1947 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
1948 inst0
->bits3
.if_else
.jump_count
== 0) {
1949 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1951 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
1952 inst0
->bits3
.if_else
.jump_count
== 0) {
1953 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1959 brw_set_predicate_control(p
, get_predicate(inst
));
1960 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1961 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1964 brw_set_access_mode(p
, BRW_ALIGN_1
);
1965 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1966 brw_set_access_mode(p
, BRW_ALIGN_16
);
1967 brw_ADD(p
, get_addr_reg(stack_index
),
1968 get_addr_reg(stack_index
), brw_imm_d(4));
1969 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1970 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1973 brw_ADD(p
, get_addr_reg(stack_index
),
1974 get_addr_reg(stack_index
), brw_imm_d(-4));
1975 brw_set_access_mode(p
, BRW_ALIGN_1
);
1976 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1977 brw_set_access_mode(p
, BRW_ALIGN_16
);
1980 emit_vertex_write(c
);
1986 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1992 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1993 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1994 _mesa_opcode_string(inst
->Opcode
) :
1998 /* Set the predication update on the last instruction of the native
1999 * instruction sequence.
2001 * This would be problematic if it was set on a math instruction,
2002 * but that shouldn't be the case with the current GLSL compiler.
2004 if (inst
->CondUpdate
) {
2005 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
2007 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
2008 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
2011 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
2012 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
2013 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
2014 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
2017 /* Result color clamping.
2019 * When destination register is an output register and
2020 * it's primary/secondary front/back color, we have to clamp
2021 * the result to [0,1]. This is done by enabling the
2022 * saturation bit for the last instruction.
2024 * We don't use brw_set_saturate() as it modifies
2025 * p->current->header.saturate, which affects all the subsequent
2026 * instructions. Instead, we directly modify the header
2027 * of the last (already stored) instruction.
2029 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
2030 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
2031 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
2032 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
2033 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
2034 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
2038 if (inst
->DstReg
.RelAddr
) {
2039 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
2040 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
2041 move_to_reladdr_dst(c
, inst
, dst
);
2047 brw_resolve_cals(p
);
2051 if (INTEL_DEBUG
& DEBUG_VS
) {
2054 printf("vs-native:\n");
2055 for (i
= 0; i
< p
->nr_insn
; i
++)
2056 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);