2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return whether source argument `arg` of `opcode` can be an immediate float
41 * operand instead of a PROGRAM_CONSTANT value loaded through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allows two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
124 clear_current_const(struct brw_vs_compile
*c
)
128 if (c
->vp
->use_const_buffer
) {
129 for (i
= 0; i
< 3; i
++) {
130 c
->current_const
[i
].index
= -1;
136 * Preallocate GRF register before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
140 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
142 struct intel_context
*intel
= &c
->func
.brw
->intel
;
143 GLuint i
, reg
= 0, mrf
;
144 int attributes_in_vue
;
145 int first_reladdr_output
;
147 /* Determine whether to use a real constant buffer or use a block
148 * of GRF registers for constants. The latter is faster but only
149 * works if everything fits in the GRF.
150 * XXX this heuristic/check may need some fine tuning...
152 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
153 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
154 c
->vp
->use_const_buffer
= GL_TRUE
;
156 c
->vp
->use_const_buffer
= GL_FALSE
;
158 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
160 /* r0 -- reserved as usual
162 c
->r0
= brw_vec8_grf(reg
, 0);
165 /* User clip planes from curbe:
167 if (c
->key
.nr_userclip
) {
168 if (intel
->gen
>= 6) {
169 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
170 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
171 (i
% 2) * 4), 0, 4, 1);
173 reg
+= ALIGN(c
->key
.nr_userclip
, 2) / 2;
175 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
176 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ (6 + i
) / 2,
177 (i
% 2) * 4), 0, 4, 1);
179 reg
+= (ALIGN(6 + c
->key
.nr_userclip
, 4) / 4) * 2;
184 /* Vertex program parameters from curbe:
186 if (c
->vp
->use_const_buffer
) {
187 int max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
190 /* We've got more constants than we can load with the push
191 * mechanism. This is often correlated with reladdr loads where
192 * we should probably be using a pull mechanism anyway to avoid
193 * excessive reading. However, the pull mechanism is slow in
194 * general. So, we try to allocate as many non-reladdr-loaded
195 * constants through the push buffer as we can before giving up.
197 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
199 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
201 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
204 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
205 if ((inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
206 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
207 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
208 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
209 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) ||
210 inst
->SrcReg
[arg
].RelAddr
)
213 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
214 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
219 for (i
= 0; i
< constant
; i
++) {
220 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2,
224 reg
+= (constant
+ 1) / 2;
225 c
->prog_data
.curb_read_length
= reg
- 1;
226 /* XXX 0 causes a bug elsewhere... */
227 c
->prog_data
.nr_params
= MAX2(constant
* 4, 4);
230 /* use a section of the GRF for constants */
231 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
232 for (i
= 0; i
< nr_params
; i
++) {
233 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
235 reg
+= (nr_params
+ 1) / 2;
236 c
->prog_data
.curb_read_length
= reg
- 1;
238 c
->prog_data
.nr_params
= nr_params
* 4;
241 /* Allocate input regs:
244 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
245 if (c
->prog_data
.inputs_read
& (1 << i
)) {
247 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
251 /* If there are no inputs, we'll still be reading one attribute's worth
252 * because it's required -- see urb_read_length setting.
254 if (c
->nr_inputs
== 0)
257 /* Allocate outputs. The non-position outputs go straight into message regs.
260 c
->first_output
= reg
;
261 c
->first_overflow_output
= 0;
263 if (intel
->gen
>= 6) {
265 if (c
->key
.nr_userclip
)
267 } else if (intel
->gen
== 5)
272 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
273 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
274 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
276 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
277 if (i
== VERT_RESULT_HPOS
) {
278 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
281 else if (i
== VERT_RESULT_PSIZ
) {
282 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
284 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
287 /* Two restrictions on our compute-to-MRF here. The
288 * message length for all SEND messages is restricted to
289 * [1,15], so we can't use mrf 15, as that means a length
292 * Additionally, URB writes are aligned to URB rows, so we
293 * need to put an even number of registers of URB data in
294 * each URB write so that the later write is aligned. A
295 * message length of 15 means 1 message header reg plus 14
298 * For attributes beyond the compute-to-MRF, we compute to
299 * GRFs and they will be written in the second URB_WRITE.
301 if (first_reladdr_output
> i
&& mrf
< 15) {
302 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
306 if (mrf
>= 15 && !c
->first_overflow_output
)
307 c
->first_overflow_output
= i
;
308 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
316 /* Allocate program temporaries:
318 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
319 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
323 /* Address reg(s). Don't try to use the internal address reg until
326 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
327 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
331 BRW_VERTICAL_STRIDE_8
,
333 BRW_HORIZONTAL_STRIDE_1
,
339 if (c
->vp
->use_const_buffer
) {
340 for (i
= 0; i
< 3; i
++) {
341 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
344 clear_current_const(c
);
347 for (i
= 0; i
< 128; i
++) {
348 if (c
->output_regs
[i
].used_in_src
) {
349 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
354 if (c
->needs_stack
) {
355 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
359 /* Some opcodes need an internal temporary:
362 c
->last_tmp
= reg
; /* for allocation purposes */
364 /* Each input reg holds data from two vertices. The
365 * urb_read_length is the number of registers read from *each*
366 * vertex urb, so is half the amount:
368 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
369 /* Setting this field to 0 leads to undefined behavior according to the
370 * VS_STATE docs. Our VUEs will always have at least one attribute
371 * sitting in them, even if it's padding.
373 if (c
->prog_data
.urb_read_length
== 0)
374 c
->prog_data
.urb_read_length
= 1;
376 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
377 * them to fit the biggest thing they need to.
379 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
381 /* See emit_vertex_write() for where the VUE's overhead on top of the
382 * attributes comes from.
384 if (intel
->gen
>= 6) {
386 if (c
->key
.nr_userclip
)
389 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 7) / 8;
390 } else if (intel
->gen
== 5)
391 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
393 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
395 c
->prog_data
.total_grf
= reg
;
397 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
398 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
399 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
400 printf("%s reg = %d\n", __FUNCTION__
, reg
);
406 * If an instruction uses a temp reg both as a src and the dest, we
407 * sometimes need to allocate an intermediate temporary.
409 static void unalias1( struct brw_vs_compile
*c
,
412 void (*func
)( struct brw_vs_compile
*,
416 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
417 struct brw_compile
*p
= &c
->func
;
418 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
420 brw_MOV(p
, dst
, tmp
);
430 * Checks if a 2-operand instruction needs an intermediate temporary.
432 static void unalias2( struct brw_vs_compile
*c
,
436 void (*func
)( struct brw_vs_compile
*,
441 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
442 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
443 struct brw_compile
*p
= &c
->func
;
444 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
445 func(c
, tmp
, arg0
, arg1
);
446 brw_MOV(p
, dst
, tmp
);
450 func(c
, dst
, arg0
, arg1
);
456 * Checks if a 3-operand instruction needs an intermediate temporary.
458 static void unalias3( struct brw_vs_compile
*c
,
463 void (*func
)( struct brw_vs_compile
*,
469 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
470 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
471 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
472 struct brw_compile
*p
= &c
->func
;
473 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
474 func(c
, tmp
, arg0
, arg1
, arg2
);
475 brw_MOV(p
, dst
, tmp
);
479 func(c
, dst
, arg0
, arg1
, arg2
);
483 static void emit_sop( struct brw_vs_compile
*c
,
489 struct brw_compile
*p
= &c
->func
;
491 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
492 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
493 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
494 brw_set_predicate_control_flag_value(p
, 0xff);
497 static void emit_seq( struct brw_vs_compile
*c
,
500 struct brw_reg arg1
)
502 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
505 static void emit_sne( struct brw_vs_compile
*c
,
508 struct brw_reg arg1
)
510 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
512 static void emit_slt( struct brw_vs_compile
*c
,
515 struct brw_reg arg1
)
517 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
520 static void emit_sle( struct brw_vs_compile
*c
,
523 struct brw_reg arg1
)
525 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
528 static void emit_sgt( struct brw_vs_compile
*c
,
531 struct brw_reg arg1
)
533 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
536 static void emit_sge( struct brw_vs_compile
*c
,
539 struct brw_reg arg1
)
541 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
544 static void emit_cmp( struct brw_compile
*p
,
548 struct brw_reg arg2
)
550 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
551 brw_SEL(p
, dst
, arg1
, arg2
);
552 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
555 static void emit_sign(struct brw_vs_compile
*c
,
559 struct brw_compile
*p
= &c
->func
;
561 brw_MOV(p
, dst
, brw_imm_f(0));
563 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
564 brw_MOV(p
, dst
, brw_imm_f(-1.0));
565 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
567 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
568 brw_MOV(p
, dst
, brw_imm_f(1.0));
569 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
572 static void emit_max( struct brw_compile
*p
,
575 struct brw_reg arg1
)
577 struct intel_context
*intel
= &p
->brw
->intel
;
579 if (intel
->gen
>= 6) {
580 brw_set_conditionalmod(p
, BRW_CONDITIONAL_GE
);
581 brw_SEL(p
, dst
, arg0
, arg1
);
582 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
583 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
585 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
586 brw_SEL(p
, dst
, arg0
, arg1
);
587 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
591 static void emit_min( struct brw_compile
*p
,
594 struct brw_reg arg1
)
596 struct intel_context
*intel
= &p
->brw
->intel
;
598 if (intel
->gen
>= 6) {
599 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
600 brw_SEL(p
, dst
, arg0
, arg1
);
601 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
602 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
604 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
605 brw_SEL(p
, dst
, arg0
, arg1
);
606 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
610 static void emit_math1_gen4(struct brw_vs_compile
*c
,
616 /* There are various odd behaviours with SEND on the simulator. In
617 * addition there are documented issues with the fact that the GEN4
618 * processor doesn't do dependency control properly on SEND
619 * results. So, on balance, this kludge to get around failures
620 * with writemasked math results looks like it might be necessary
621 * whether that turns out to be a simulator bug or not:
623 struct brw_compile
*p
= &c
->func
;
624 struct brw_reg tmp
= dst
;
625 GLboolean need_tmp
= GL_FALSE
;
627 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
628 dst
.dw1
.bits
.writemask
!= 0xf)
637 BRW_MATH_SATURATE_NONE
,
640 BRW_MATH_DATA_SCALAR
,
644 brw_MOV(p
, dst
, tmp
);
650 emit_math1_gen6(struct brw_vs_compile
*c
,
656 struct brw_compile
*p
= &c
->func
;
657 struct brw_reg tmp_src
, tmp_dst
;
659 /* Something is strange on gen6 math in 16-wide mode, though the
660 * docs say it's supposed to work. Punt to using align1 mode,
661 * which doesn't do writemasking and swizzles.
663 tmp_src
= get_tmp(c
);
664 tmp_dst
= get_tmp(c
);
666 brw_MOV(p
, tmp_src
, arg0
);
668 brw_set_access_mode(p
, BRW_ALIGN_1
);
672 BRW_MATH_SATURATE_NONE
,
675 BRW_MATH_DATA_SCALAR
,
677 brw_set_access_mode(p
, BRW_ALIGN_16
);
679 brw_MOV(p
, dst
, tmp_dst
);
681 release_tmp(c
, tmp_src
);
682 release_tmp(c
, tmp_dst
);
686 emit_math1(struct brw_vs_compile
*c
,
692 struct brw_compile
*p
= &c
->func
;
693 struct intel_context
*intel
= &p
->brw
->intel
;
696 emit_math1_gen6(c
, function
, dst
, arg0
, precision
);
698 emit_math1_gen4(c
, function
, dst
, arg0
, precision
);
701 static void emit_math2( struct brw_vs_compile
*c
,
708 struct brw_compile
*p
= &c
->func
;
709 struct intel_context
*intel
= &p
->brw
->intel
;
710 struct brw_reg tmp
= dst
;
711 GLboolean need_tmp
= GL_FALSE
;
713 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
)
716 if (intel
->gen
< 6 && dst
.dw1
.bits
.writemask
!= 0xf)
722 brw_MOV(p
, brw_message_reg(3), arg1
);
727 BRW_MATH_SATURATE_NONE
,
730 BRW_MATH_DATA_SCALAR
,
734 brw_MOV(p
, dst
, tmp
);
740 static void emit_exp_noalias( struct brw_vs_compile
*c
,
742 struct brw_reg arg0
)
744 struct brw_compile
*p
= &c
->func
;
747 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
748 struct brw_reg tmp
= get_tmp(c
);
749 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
751 /* tmp_d = floor(arg0.x) */
752 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
754 /* result[0] = 2.0 ^ tmp */
756 /* Adjust exponent for floating point:
759 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
761 /* Install exponent and sign.
762 * Excess drops off the edge:
764 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
765 tmp_d
, brw_imm_d(23));
770 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
771 /* result[1] = arg0.x - floor(arg0.x) */
772 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
775 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
776 /* As with the LOG instruction, we might be better off just
777 * doing a taylor expansion here, seeing as we have to do all
780 * If mathbox partial precision is too low, consider also:
781 * result[2] = result[0] * EXP(result[1])
784 BRW_MATH_FUNCTION_EXP
,
785 brw_writemask(dst
, WRITEMASK_Z
),
786 brw_swizzle1(arg0
, 0),
787 BRW_MATH_PRECISION_FULL
);
790 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
791 /* result[3] = 1.0; */
792 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
797 static void emit_log_noalias( struct brw_vs_compile
*c
,
799 struct brw_reg arg0
)
801 struct brw_compile
*p
= &c
->func
;
802 struct brw_reg tmp
= dst
;
803 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
804 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
805 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
806 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
810 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
813 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
816 * These almost look like they could be joined up, but not really
819 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
820 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
822 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
824 brw_writemask(tmp_ud
, WRITEMASK_X
),
825 brw_swizzle1(arg0_ud
, 0),
826 brw_imm_ud((1U<<31)-1));
829 brw_writemask(tmp_ud
, WRITEMASK_X
),
834 brw_writemask(tmp
, WRITEMASK_X
),
835 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
839 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
841 brw_writemask(tmp_ud
, WRITEMASK_Y
),
842 brw_swizzle1(arg0_ud
, 0),
843 brw_imm_ud((1<<23)-1));
846 brw_writemask(tmp_ud
, WRITEMASK_Y
),
848 brw_imm_ud(127<<23));
851 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
852 /* result[2] = result[0] + LOG2(result[1]); */
854 /* Why bother? The above is just a hint how to do this with a
855 * taylor series. Maybe we *should* use a taylor series as by
856 * the time all the above has been done it's almost certainly
857 * quicker than calling the mathbox, even with low precision.
860 * - result[0] + mathbox.LOG2(result[1])
861 * - mathbox.LOG2(arg0.x)
862 * - result[0] + inline_taylor_approx(result[1])
865 BRW_MATH_FUNCTION_LOG
,
866 brw_writemask(tmp
, WRITEMASK_Z
),
867 brw_swizzle1(tmp
, 1),
868 BRW_MATH_PRECISION_FULL
);
871 brw_writemask(tmp
, WRITEMASK_Z
),
872 brw_swizzle1(tmp
, 2),
873 brw_swizzle1(tmp
, 0));
876 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
877 /* result[3] = 1.0; */
878 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
882 brw_MOV(p
, dst
, tmp
);
888 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
890 static void emit_dst_noalias( struct brw_vs_compile
*c
,
895 struct brw_compile
*p
= &c
->func
;
897 /* There must be a better way to do this:
899 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
900 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
901 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
902 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
903 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
904 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
905 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
906 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
910 static void emit_xpd( struct brw_compile
*p
,
915 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
916 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
920 static void emit_lit_noalias( struct brw_vs_compile
*c
,
922 struct brw_reg arg0
)
924 struct brw_compile
*p
= &c
->func
;
925 struct brw_instruction
*if_insn
;
926 struct brw_reg tmp
= dst
;
927 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
932 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
933 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
935 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
936 * to get all channels active inside the IF. In the clipping code
937 * we run with NoMask, so it's not an option and we can use
938 * BRW_EXECUTE_1 for all comparisons.
940 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
941 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
943 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
945 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
946 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
947 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
950 BRW_MATH_FUNCTION_POW
,
951 brw_writemask(dst
, WRITEMASK_Z
),
952 brw_swizzle1(tmp
, 2),
953 brw_swizzle1(arg0
, 3),
954 BRW_MATH_PRECISION_PARTIAL
);
957 brw_ENDIF(p
, if_insn
);
962 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
968 struct brw_compile
*p
= &c
->func
;
970 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
971 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
972 brw_MAC(p
, dst
, arg0
, arg1
);
975 /** 3 or 4-component vector normalization */
976 static void emit_nrm( struct brw_vs_compile
*c
,
981 struct brw_compile
*p
= &c
->func
;
982 struct brw_reg tmp
= get_tmp(c
);
984 /* tmp = dot(arg0, arg0) */
986 brw_DP3(p
, tmp
, arg0
, arg0
);
988 brw_DP4(p
, tmp
, arg0
, arg0
);
990 /* tmp = 1 / sqrt(tmp) */
991 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
993 /* dst = arg0 * tmp */
994 brw_MUL(p
, dst
, arg0
, tmp
);
1000 static struct brw_reg
1001 get_constant(struct brw_vs_compile
*c
,
1002 const struct prog_instruction
*inst
,
1005 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1006 struct brw_compile
*p
= &c
->func
;
1007 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1009 assert(argIndex
< 3);
1011 assert(c
->func
.brw
->intel
.gen
< 6); /* FINISHME */
1013 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
1014 /* Keep track of the last constant loaded in this slot, for reuse. */
1015 c
->current_const
[argIndex
].index
= src
->Index
;
1018 printf(" fetch const[%d] for arg %d into reg %d\n",
1019 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1021 /* need to fetch the constant now */
1023 const_reg
, /* writeback dest */
1024 16 * src
->Index
, /* byte offset */
1025 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
1029 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1030 const_reg
= stride(const_reg
, 0, 4, 0);
1031 const_reg
.subnr
= 0;
1036 static struct brw_reg
1037 get_reladdr_constant(struct brw_vs_compile
*c
,
1038 const struct prog_instruction
*inst
,
1041 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1042 struct brw_compile
*p
= &c
->func
;
1043 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1044 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
1045 struct brw_reg byte_addr_reg
= retype(get_tmp(c
), BRW_REGISTER_TYPE_D
);
1047 assert(argIndex
< 3);
1049 assert(c
->func
.brw
->intel
.gen
< 6); /* FINISHME */
1051 /* Can't reuse a reladdr constant load. */
1052 c
->current_const
[argIndex
].index
= -1;
1055 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1056 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1059 brw_MUL(p
, byte_addr_reg
, addrReg
, brw_imm_ud(16));
1061 /* fetch the first vec4 */
1062 brw_dp_READ_4_vs_relative(p
,
1063 const_reg
, /* writeback dest */
1064 byte_addr_reg
, /* address register */
1065 16 * src
->Index
, /* byte offset */
1066 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
1074 /* TODO: relative addressing!
1076 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
1077 gl_register_file file
,
1081 case PROGRAM_TEMPORARY
:
1083 case PROGRAM_OUTPUT
:
1084 assert(c
->regs
[file
][index
].nr
!= 0);
1085 return c
->regs
[file
][index
];
1086 case PROGRAM_STATE_VAR
:
1087 case PROGRAM_CONSTANT
:
1088 case PROGRAM_UNIFORM
:
1089 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1090 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1091 case PROGRAM_ADDRESS
:
1093 return c
->regs
[file
][index
];
1095 case PROGRAM_UNDEFINED
: /* undef values */
1096 return brw_null_reg();
1098 case PROGRAM_LOCAL_PARAM
:
1099 case PROGRAM_ENV_PARAM
:
1100 case PROGRAM_WRITE_ONLY
:
1103 return brw_null_reg();
1109 * Indirect addressing: get reg[[arg] + offset].
1111 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1116 struct brw_compile
*p
= &c
->func
;
1117 struct brw_reg tmp
= get_tmp(c
);
1118 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1119 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1120 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1121 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1122 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1124 /* Set the vertical stride on the register access so that the first
1125 * 4 components come from a0.0 and the second 4 from a0.1.
1127 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1130 brw_push_insn_state(p
);
1131 brw_set_access_mode(p
, BRW_ALIGN_1
);
1133 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1134 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1136 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1137 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1139 brw_MOV(p
, tmp
, indirect
);
1141 brw_pop_insn_state(p
);
1144 /* NOTE: tmp not released */
1149 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1150 const struct prog_instruction
*inst
,
1153 struct brw_compile
*p
= &c
->func
;
1155 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1156 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1157 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1158 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1159 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1160 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1162 /* Because destination register indirect addressing can only use
1163 * one index, we'll write each vertex's vec4 value separately.
1165 val
.width
= BRW_WIDTH_4
;
1166 val
.vstride
= BRW_VERTICAL_STRIDE_4
;
1168 brw_push_insn_state(p
);
1169 brw_set_access_mode(p
, BRW_ALIGN_1
);
1171 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1172 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1173 brw_MOV(p
, indirect
, val
);
1175 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1176 brw_ADD(p
, brw_address_reg(0), acc
,
1177 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1178 brw_MOV(p
, indirect
, suboffset(val
, 4));
1180 brw_pop_insn_state(p
);
1184 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1185 * TODO: relative addressing!
1187 static struct brw_reg
1188 get_src_reg( struct brw_vs_compile
*c
,
1189 const struct prog_instruction
*inst
,
1192 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1193 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1194 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1196 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1197 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1199 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1203 return brw_imm_f(0.0f
);
1204 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1209 return brw_imm_f(-1.0F
);
1211 return brw_imm_f(1.0F
);
1212 } else if (src
->File
== PROGRAM_CONSTANT
) {
1213 const struct gl_program_parameter_list
*params
;
1217 switch (src
->Swizzle
) {
1232 if (component
>= 0) {
1233 params
= c
->vp
->program
.Base
.Parameters
;
1234 f
= params
->ParameterValues
[src
->Index
][component
];
1240 return brw_imm_f(f
);
1246 case PROGRAM_TEMPORARY
:
1248 case PROGRAM_OUTPUT
:
1250 return deref(c
, c
->regs
[file
][0], index
, 32);
1253 assert(c
->regs
[file
][index
].nr
!= 0);
1254 return c
->regs
[file
][index
];
1257 case PROGRAM_STATE_VAR
:
1258 case PROGRAM_CONSTANT
:
1259 case PROGRAM_UNIFORM
:
1260 case PROGRAM_ENV_PARAM
:
1261 case PROGRAM_LOCAL_PARAM
:
1262 if (c
->vp
->use_const_buffer
) {
1263 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1264 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1265 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1267 return get_reladdr_constant(c
, inst
, argIndex
);
1269 return get_constant(c
, inst
, argIndex
);
1272 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
, 16);
1275 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1276 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1278 case PROGRAM_ADDRESS
:
1280 return c
->regs
[file
][index
];
1282 case PROGRAM_UNDEFINED
:
1283 /* this is a normal case since we loop over all three src args */
1284 return brw_null_reg();
1286 case PROGRAM_WRITE_ONLY
:
1289 return brw_null_reg();
1294 * Return the brw reg for the given instruction's src argument.
1295 * Will return mangled results for SWZ op. The emit_swz() function
1296 * ignores this result and recalculates taking extended swizzles into
1299 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1300 const struct prog_instruction
*inst
,
1303 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1306 if (src
->File
== PROGRAM_UNDEFINED
)
1307 return brw_null_reg();
1309 reg
= get_src_reg(c
, inst
, argIndex
);
1311 /* Convert 3-bit swizzle to 2-bit.
1313 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1314 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1315 GET_SWZ(src
->Swizzle
, 1),
1316 GET_SWZ(src
->Swizzle
, 2),
1317 GET_SWZ(src
->Swizzle
, 3));
1320 /* Note this is ok for non-swizzle instructions:
1322 reg
.negate
= src
->Negate
? 1 : 0;
1329 * Get brw register for the given program dest register.
1331 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1332 struct prog_dst_register dst
)
1337 case PROGRAM_TEMPORARY
:
1338 case PROGRAM_OUTPUT
:
1339 /* register-indirect addressing is only 1x1, not VxH, for
1340 * destination regs. So, for RelAddr we'll return a temporary
1341 * for the dest and do a move of the result to the RelAddr
1342 * register after the instruction emit.
1347 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1348 reg
= c
->regs
[dst
.File
][dst
.Index
];
1351 case PROGRAM_ADDRESS
:
1352 assert(dst
.Index
== 0);
1353 reg
= c
->regs
[dst
.File
][dst
.Index
];
1355 case PROGRAM_UNDEFINED
:
1356 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1357 reg
= brw_null_reg();
1361 reg
= brw_null_reg();
1364 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1365 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1371 static void emit_swz( struct brw_vs_compile
*c
,
1373 const struct prog_instruction
*inst
)
1375 const GLuint argIndex
= 0;
1376 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1377 struct brw_compile
*p
= &c
->func
;
1378 GLuint zeros_mask
= 0;
1379 GLuint ones_mask
= 0;
1380 GLuint src_mask
= 0;
1382 GLboolean need_tmp
= (src
.Negate
&&
1383 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1384 struct brw_reg tmp
= dst
;
1390 for (i
= 0; i
< 4; i
++) {
1391 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1392 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1411 /* Do src first, in case dst aliases src:
1414 struct brw_reg arg0
;
1416 arg0
= get_src_reg(c
, inst
, argIndex
);
1418 arg0
= brw_swizzle(arg0
,
1419 src_swz
[0], src_swz
[1],
1420 src_swz
[2], src_swz
[3]);
1422 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1426 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1429 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1432 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1435 brw_MOV(p
, dst
, tmp
);
1436 release_tmp(c
, tmp
);
1442 * Post-vertex-program processing. Send the results to the URB.
1444 static void emit_vertex_write( struct brw_vs_compile
*c
)
1446 struct brw_compile
*p
= &c
->func
;
1447 struct brw_context
*brw
= p
->brw
;
1448 struct intel_context
*intel
= &brw
->intel
;
1449 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1452 GLuint len_vertex_header
= 2;
1455 if (c
->key
.copy_edgeflag
) {
1457 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1458 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1461 if (intel
->gen
< 6) {
1462 /* Build ndc coords */
1464 /* ndc = 1.0 / pos.w */
1465 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1466 /* ndc.xyz = pos * ndc */
1467 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1470 /* Update the header for point size, user clipping flags, and -ve rhw
1473 if (intel
->gen
>= 6) {
1474 struct brw_reg m1
= brw_message_reg(1);
1476 /* On gen6, m1 has each value in a separate dword, so we never
1477 * need to mess with a temporary for computing the m1 value.
1479 brw_MOV(p
, retype(m1
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1480 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1481 brw_MOV(p
, brw_writemask(m1
, WRITEMASK_W
),
1482 brw_swizzle1(c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
], 0));
1485 /* Set the user clip distances in dword 8-15. (m3-4)*/
1486 if (c
->key
.nr_userclip
) {
1487 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1490 m
= brw_message_reg(3);
1492 m
= brw_message_reg(4);
1494 brw_DP4(p
, brw_writemask(m
, (1 << (i
& 7))),pos
, c
->userplane
[i
]);
1497 } else if ((c
->prog_data
.outputs_written
&
1498 BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1499 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
) {
1500 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1503 brw_MOV(p
, header1
, brw_imm_ud(0));
1505 brw_set_access_mode(p
, BRW_ALIGN_16
);
1507 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1508 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1509 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
),
1510 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1511 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
),
1512 header1
, brw_imm_ud(0x7ff<<8));
1515 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1516 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1517 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1518 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1519 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1522 /* i965 clipping workaround:
1523 * 1) Test for -ve rhw
1525 * set ndc = (0,0,0,0)
1528 * Later, clipping will detect ucp[6] and ensure the primitive is
1529 * clipped against all fixed planes.
1531 if (brw
->has_negative_rhw_bug
) {
1533 vec8(brw_null_reg()),
1535 brw_swizzle1(ndc
, 3),
1538 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1539 brw_MOV(p
, ndc
, brw_imm_f(0));
1540 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1543 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1544 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1545 brw_set_access_mode(p
, BRW_ALIGN_16
);
1547 release_tmp(c
, header1
);
1550 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1553 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1554 * of zeros followed by two sets of NDC coordinates:
1556 brw_set_access_mode(p
, BRW_ALIGN_1
);
1557 brw_set_acc_write_control(p
, 0);
1559 /* The VUE layout is documented in Volume 2a. */
1560 if (intel
->gen
>= 6) {
1561 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1562 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1563 * dword 4-7 (m2) is the 4D space position
1564 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1566 * m3 or 5 is the first vertex element data we fill, which is
1567 * the vertex position.
1569 brw_MOV(p
, brw_message_reg(2), pos
);
1570 len_vertex_header
= 1;
1571 if (c
->key
.nr_userclip
> 0)
1572 len_vertex_header
+= 2;
1573 } else if (intel
->gen
== 5) {
1574 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1575 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1576 * dword 4-7 (m2) is the ndc position (set above)
1577 * dword 8-11 (m3) of the vertex header is the 4D space position
1578 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1579 * m6 is a pad so that the vertex element data is aligned
1580 * m7 is the first vertex data we fill, which is the vertex position.
1582 brw_MOV(p
, brw_message_reg(2), ndc
);
1583 brw_MOV(p
, brw_message_reg(3), pos
);
1584 brw_MOV(p
, brw_message_reg(7), pos
);
1585 len_vertex_header
= 6;
1587 /* There are 8 dwords in VUE header pre-Ironlake:
1588 * dword 0-3 (m1) is indices, point width, clip flags.
1589 * dword 4-7 (m2) is ndc position (set above)
1591 * dword 8-11 (m3) is the first vertex data, which we always have be the
1594 brw_MOV(p
, brw_message_reg(2), ndc
);
1595 brw_MOV(p
, brw_message_reg(3), pos
);
1596 len_vertex_header
= 2;
1599 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1600 next_mrf
= 2 + len_vertex_header
;
1601 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1602 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1604 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
1607 if (i
>= VERT_RESULT_TEX0
&&
1608 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1609 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1611 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1612 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
1616 eot
= (c
->first_overflow_output
== 0);
1619 brw_null_reg(), /* dest */
1620 0, /* starting mrf reg nr */
1624 MIN2(c
->nr_outputs
+ 1 + len_vertex_header
, (BRW_MAX_MRF
-1)), /* msg len */
1625 0, /* response len */
1627 eot
, /* writes complete */
1628 0, /* urb destination offset */
1629 BRW_URB_SWIZZLE_INTERLEAVE
);
1631 if (c
->first_overflow_output
> 0) {
1632 /* Not all of the vertex outputs/results fit into the MRF.
1633 * Move the overflowed attributes from the GRF to the MRF and
1634 * issue another brw_urb_WRITE().
1637 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1638 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1639 /* move from GRF to MRF */
1640 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1646 brw_null_reg(), /* dest */
1647 0, /* starting mrf reg nr */
1652 0, /* response len */
1654 1, /* writes complete */
1655 14 / 2, /* urb destination offset */
1656 BRW_URB_SWIZZLE_INTERLEAVE
);
1661 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1663 struct brw_compile
*p
= &c
->func
;
1664 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1666 if (p
->nr_insn
== 0)
1669 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1672 switch (prev_insn
->header
.opcode
) {
1673 case BRW_OPCODE_MOV
:
1674 case BRW_OPCODE_MAC
:
1675 case BRW_OPCODE_MUL
:
1676 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1677 prev_insn
->header
.execution_size
== val
.width
&&
1678 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1679 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1680 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1681 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1682 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1683 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1693 get_predicate(const struct prog_instruction
*inst
)
1695 if (inst
->DstReg
.CondMask
== COND_TR
)
1696 return BRW_PREDICATE_NONE
;
1698 /* All of GLSL only produces predicates for COND_NE and one channel per
1699 * vector. Fail badly if someone starts doing something else, as it might
1700 * mean infinite looping or something.
1702 * We'd like to support all the condition codes, but our hardware doesn't
1703 * quite match the Mesa IR, which is modeled after the NV extensions. For
1704 * those, the instruction may update the condition codes or not, then any
1705 * later instruction may use one of those condition codes. For gen4, the
1706 * instruction may update the flags register based on one of the condition
1707 * codes output by the instruction, and then further instructions may
1708 * predicate on that. We can probably support this, but it won't
1709 * necessarily be easy.
1711 assert(inst
->DstReg
.CondMask
== COND_NE
);
1713 switch (inst
->DstReg
.CondSwizzle
) {
1715 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1717 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1719 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1721 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1723 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1724 inst
->DstReg
.CondMask
);
1725 return BRW_PREDICATE_NORMAL
;
1729 /* Emit the vertex program instructions here.
1731 void brw_vs_emit(struct brw_vs_compile
*c
)
1733 #define MAX_IF_DEPTH 32
1734 #define MAX_LOOP_DEPTH 32
1735 struct brw_compile
*p
= &c
->func
;
1736 struct brw_context
*brw
= p
->brw
;
1737 struct intel_context
*intel
= &brw
->intel
;
1738 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1739 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1740 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1741 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1742 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1746 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
1747 printf("vs-mesa:\n");
1748 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1753 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1754 brw_set_access_mode(p
, BRW_ALIGN_16
);
1755 if_depth_in_loop
[loop_depth
] = 0;
1757 brw_set_acc_write_control(p
, 1);
1759 for (insn
= 0; insn
< nr_insns
; insn
++) {
1761 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1763 /* Message registers can't be read, so copy the output into GRF
1764 * register if they are used in source registers
1766 for (i
= 0; i
< 3; i
++) {
1767 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1768 GLuint index
= src
->Index
;
1769 GLuint file
= src
->File
;
1770 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1771 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1774 switch (inst
->Opcode
) {
1777 c
->needs_stack
= GL_TRUE
;
1784 /* Static register allocation
1786 brw_vs_alloc_regs(c
);
1789 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1791 for (insn
= 0; insn
< nr_insns
; insn
++) {
1793 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1794 struct brw_reg args
[3], dst
;
1798 printf("%d: ", insn
);
1799 _mesa_print_instruction(inst
);
1802 /* Get argument regs. SWZ is special and does this itself.
1804 if (inst
->Opcode
!= OPCODE_SWZ
)
1805 for (i
= 0; i
< 3; i
++) {
1806 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1809 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1810 args
[i
] = c
->output_regs
[index
].reg
;
1812 args
[i
] = get_arg(c
, inst
, i
);
1815 /* Get dest regs. Note that it is possible for a reg to be both
1816 * dst and arg, given the static allocation of registers. So
1817 * care needs to be taken emitting multi-operation instructions.
1819 index
= inst
->DstReg
.Index
;
1820 file
= inst
->DstReg
.File
;
1821 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1822 dst
= c
->output_regs
[index
].reg
;
1824 dst
= get_dst(c
, inst
->DstReg
);
1826 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1827 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1828 inst
->SaturateMode
);
1831 switch (inst
->Opcode
) {
1833 brw_MOV(p
, dst
, brw_abs(args
[0]));
1836 brw_ADD(p
, dst
, args
[0], args
[1]);
1839 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1842 brw_DP2(p
, dst
, args
[0], args
[1]);
1845 brw_DP3(p
, dst
, args
[0], args
[1]);
1848 brw_DP4(p
, dst
, args
[0], args
[1]);
1851 brw_DPH(p
, dst
, args
[0], args
[1]);
1854 emit_nrm(c
, dst
, args
[0], 3);
1857 emit_nrm(c
, dst
, args
[0], 4);
1860 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1863 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1866 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1869 brw_RNDD(p
, dst
, args
[0]);
1872 brw_RNDD(p
, dst
, args
[0]);
1875 brw_FRC(p
, dst
, args
[0]);
1878 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1881 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1884 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1887 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1890 if (!accumulator_contains(c
, args
[2]))
1891 brw_MOV(p
, brw_acc_reg(), args
[2]);
1892 brw_MAC(p
, dst
, args
[0], args
[1]);
1895 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
1898 emit_max(p
, dst
, args
[0], args
[1]);
1901 emit_min(p
, dst
, args
[0], args
[1]);
1904 brw_MOV(p
, dst
, args
[0]);
1907 brw_MUL(p
, dst
, args
[0], args
[1]);
1910 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1913 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1916 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1920 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
1923 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1926 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
1929 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
1932 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
1935 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
1938 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
1941 unalias1(c
, dst
, args
[0], emit_sign
);
1944 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1947 /* The args[0] value can't be used here as it won't have
1948 * correctly encoded the full swizzle:
1950 emit_swz(c
, dst
, inst
);
1953 /* round toward zero */
1954 brw_RNDZ(p
, dst
, args
[0]);
1957 emit_xpd(p
, dst
, args
[0], args
[1]);
1960 assert(if_depth
< MAX_IF_DEPTH
);
1961 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1962 /* Note that brw_IF smashes the predicate_control field. */
1963 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
1964 if_depth_in_loop
[loop_depth
]++;
1968 clear_current_const(c
);
1969 assert(if_depth
> 0);
1970 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1973 clear_current_const(c
);
1974 assert(if_depth
> 0);
1975 brw_ENDIF(p
, if_inst
[--if_depth
]);
1976 if_depth_in_loop
[loop_depth
]--;
1978 case OPCODE_BGNLOOP
:
1979 clear_current_const(c
);
1980 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1981 if_depth_in_loop
[loop_depth
] = 0;
1984 brw_set_predicate_control(p
, get_predicate(inst
));
1985 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
1986 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1989 brw_set_predicate_control(p
, get_predicate(inst
));
1990 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
1991 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1993 case OPCODE_ENDLOOP
:
1995 clear_current_const(c
);
1996 struct brw_instruction
*inst0
, *inst1
;
2001 if (intel
->gen
== 5)
2004 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2005 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2006 while (inst0
> loop_inst
[loop_depth
]) {
2008 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
2009 inst0
->bits3
.if_else
.jump_count
== 0) {
2010 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2012 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
2013 inst0
->bits3
.if_else
.jump_count
== 0) {
2014 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2020 brw_set_predicate_control(p
, get_predicate(inst
));
2021 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2022 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2025 brw_set_access_mode(p
, BRW_ALIGN_1
);
2026 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2027 brw_set_access_mode(p
, BRW_ALIGN_16
);
2028 brw_ADD(p
, get_addr_reg(stack_index
),
2029 get_addr_reg(stack_index
), brw_imm_d(4));
2030 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
2031 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2034 brw_ADD(p
, get_addr_reg(stack_index
),
2035 get_addr_reg(stack_index
), brw_imm_d(-4));
2036 brw_set_access_mode(p
, BRW_ALIGN_1
);
2037 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
2038 brw_set_access_mode(p
, BRW_ALIGN_16
);
2041 emit_vertex_write(c
);
2047 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
2053 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
2054 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
2055 _mesa_opcode_string(inst
->Opcode
) :
2059 /* Set the predication update on the last instruction of the native
2060 * instruction sequence.
2062 * This would be problematic if it was set on a math instruction,
2063 * but that shouldn't be the case with the current GLSL compiler.
2065 if (inst
->CondUpdate
) {
2066 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
2068 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
2069 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
2072 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
2073 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
2074 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
2075 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
2078 /* Result color clamping.
2080 * When destination register is an output register and
2081 * it's primary/secondary front/back color, we have to clamp
2082 * the result to [0,1]. This is done by enabling the
2083 * saturation bit for the last instruction.
2085 * We don't use brw_set_saturate() as it modifies
2086 * p->current->header.saturate, which affects all the subsequent
2087 * instructions. Instead, we directly modify the header
2088 * of the last (already stored) instruction.
2090 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
2091 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
2092 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
2093 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
2094 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
2095 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
2099 if (inst
->DstReg
.RelAddr
) {
2100 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
2101 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
2102 move_to_reladdr_dst(c
, inst
, dst
);
2108 brw_resolve_cals(p
);
2112 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
2115 printf("vs-native:\n");
2116 for (i
= 0; i
< p
->nr_insn
; i
++)
2117 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);