2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of using PROGRAM_CONSTANT values through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allows two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
124 clear_current_const(struct brw_vs_compile
*c
)
128 if (c
->vp
->use_const_buffer
) {
129 for (i
= 0; i
< 3; i
++) {
130 c
->current_const
[i
].index
= -1;
136 * Preallocate GRF register before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
140 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
142 struct intel_context
*intel
= &c
->func
.brw
->intel
;
143 GLuint i
, reg
= 0, mrf
, j
;
144 int attributes_in_vue
;
145 int first_reladdr_output
;
148 int vert_result_reoder
[VERT_RESULT_MAX
];
151 /* Determine whether to use a real constant buffer or use a block
152 * of GRF registers for constants. The latter is faster but only
153 * works if everything fits in the GRF.
154 * XXX this heuristic/check may need some fine tuning...
156 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
157 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
158 c
->vp
->use_const_buffer
= GL_TRUE
;
160 c
->vp
->use_const_buffer
= GL_FALSE
;
162 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
164 /* r0 -- reserved as usual
166 c
->r0
= brw_vec8_grf(reg
, 0);
169 /* User clip planes from curbe:
171 if (c
->key
.nr_userclip
) {
172 if (intel
->gen
>= 6) {
173 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
174 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
175 (i
% 2) * 4), 0, 4, 1);
177 reg
+= ALIGN(c
->key
.nr_userclip
, 2) / 2;
179 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
180 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ (6 + i
) / 2,
181 (i
% 2) * 4), 0, 4, 1);
183 reg
+= (ALIGN(6 + c
->key
.nr_userclip
, 4) / 4) * 2;
188 /* Assign some (probably all) of the vertex program constants to
189 * the push constant buffer/CURBE.
191 * There's an obvious limit to the number of push constants equal to
192 * the number of registers available, and that number is smaller
193 * than the minimum maximum number of vertex program parameters, so
194 * support for pull constants is required if we overflow.
195 * Additionally, on gen6 the number of push constants is even
198 * When there's relative addressing, we don't know what range of
199 * Mesa IR registers can be accessed. And generally, when relative
200 * addressing is used we also have too many constants to load them
201 * all as push constants. So, we'll just support relative
202 * addressing out of the pull constant buffers, and try to load as
203 * many statically-accessed constants into the push constant buffer
206 if (intel
->gen
>= 6) {
207 /* We can only load 32 regs of push constants. */
208 max_constant
= 32 * 2 - c
->key
.nr_userclip
;
210 max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
213 /* constant_map maps from ParameterValues[] index to index in the
214 * push constant buffer, or -1 if it's only in the pull constant
217 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
219 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
221 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
224 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
225 if (inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
226 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
227 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
228 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
229 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) {
233 if (inst
->SrcReg
[arg
].RelAddr
) {
234 c
->vp
->use_const_buffer
= GL_TRUE
;
238 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
239 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
244 /* If we ran out of push constant space, then we'll also upload all
245 * constants through the pull constant buffer so that they can be
246 * accessed no matter what. For relative addressing (the common
247 * case) we need them all in place anyway.
249 if (constant
== max_constant
)
250 c
->vp
->use_const_buffer
= GL_TRUE
;
252 for (i
= 0; i
< constant
; i
++) {
253 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
257 reg
+= (constant
+ 1) / 2;
258 c
->prog_data
.curb_read_length
= reg
- 1;
259 c
->prog_data
.nr_params
= constant
* 4;
260 /* XXX 0 causes a bug elsewhere... */
261 if (intel
->gen
< 6 && c
->prog_data
.nr_params
== 0)
262 c
->prog_data
.nr_params
= 4;
264 /* Allocate input regs:
267 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
268 if (c
->prog_data
.inputs_read
& (1 << i
)) {
270 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
274 /* If there are no inputs, we'll still be reading one attribute's worth
275 * because it's required -- see urb_read_length setting.
277 if (c
->nr_inputs
== 0)
280 /* Allocate outputs. The non-position outputs go straight into message regs.
283 c
->first_output
= reg
;
284 c
->first_overflow_output
= 0;
286 if (intel
->gen
>= 6) {
288 if (c
->key
.nr_userclip
)
290 } else if (intel
->gen
== 5)
295 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
297 for (i
= 0; i
< VERT_RESULT_MAX
; i
++)
298 vert_result_reoder
[i
] = i
;
300 /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
301 if (intel
->gen
>= 6 && c
->key
.two_side_color
) {
302 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL1
)) &&
303 (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC1
))) {
304 assert(c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL0
));
305 assert(c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC0
));
307 } else if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL0
)) &&
308 (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC0
)))
312 for (i
= 0; i
< bfc
; i
++) {
313 vert_result_reoder
[VERT_RESULT_COL0
+ i
* 2 + 0] = VERT_RESULT_COL0
+ i
;
314 vert_result_reoder
[VERT_RESULT_COL0
+ i
* 2 + 1] = VERT_RESULT_BFC0
+ i
;
317 for (i
= VERT_RESULT_COL0
+ bfc
* 2; i
< VERT_RESULT_BFC0
+ bfc
; i
++) {
318 vert_result_reoder
[i
] = i
- bfc
;
323 for (j
= 0; j
< VERT_RESULT_MAX
; j
++) {
324 i
= vert_result_reoder
[j
];
326 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
328 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
329 if (i
== VERT_RESULT_HPOS
) {
330 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
333 else if (i
== VERT_RESULT_PSIZ
) {
334 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
338 /* Two restrictions on our compute-to-MRF here. The
339 * message length for all SEND messages is restricted to
340 * [1,15], so we can't use mrf 15, as that means a length
343 * Additionally, URB writes are aligned to URB rows, so we
344 * need to put an even number of registers of URB data in
345 * each URB write so that the later write is aligned. A
346 * message length of 15 means 1 message header reg plus 14
349 * For attributes beyond the compute-to-MRF, we compute to
350 * GRFs and they will be written in the second URB_WRITE.
352 if (first_reladdr_output
> i
&& mrf
< 15) {
353 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
357 if (mrf
>= 15 && !c
->first_overflow_output
)
358 c
->first_overflow_output
= i
;
359 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
367 /* Allocate program temporaries:
369 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
370 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
374 /* Address reg(s). Don't try to use the internal address reg until
377 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
378 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
382 BRW_VERTICAL_STRIDE_8
,
384 BRW_HORIZONTAL_STRIDE_1
,
390 if (c
->vp
->use_const_buffer
) {
391 for (i
= 0; i
< 3; i
++) {
392 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
395 clear_current_const(c
);
398 for (i
= 0; i
< 128; i
++) {
399 if (c
->output_regs
[i
].used_in_src
) {
400 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
405 if (c
->needs_stack
) {
406 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
410 /* Some opcodes need an internal temporary:
413 c
->last_tmp
= reg
; /* for allocation purposes */
415 /* Each input reg holds data from two vertices. The
416 * urb_read_length is the number of registers read from *each*
417 * vertex urb, so is half the amount:
419 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
420 /* Setting this field to 0 leads to undefined behavior according to the
421 * VS_STATE docs. Our VUEs will always have at least one attribute
422 * sitting in them, even if it's padding.
424 if (c
->prog_data
.urb_read_length
== 0)
425 c
->prog_data
.urb_read_length
= 1;
427 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
428 * them to fit the biggest thing they need to.
430 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
432 /* See emit_vertex_write() for where the VUE's overhead on top of the
433 * attributes comes from.
435 if (intel
->gen
>= 6) {
437 if (c
->key
.nr_userclip
)
440 /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
441 * number of 128-byte (1024-bit) units.
443 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 7) / 8;
444 } else if (intel
->gen
== 5)
445 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
446 * number of 64-byte (512-bit) units.
448 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
450 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
452 c
->prog_data
.total_grf
= reg
;
454 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
455 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
456 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
457 printf("%s reg = %d\n", __FUNCTION__
, reg
);
463 * If an instruction uses a temp reg both as a src and the dest, we
464 * sometimes need to allocate an intermediate temporary.
466 static void unalias1( struct brw_vs_compile
*c
,
469 void (*func
)( struct brw_vs_compile
*,
473 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
474 struct brw_compile
*p
= &c
->func
;
475 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
477 brw_MOV(p
, dst
, tmp
);
487 * Checks if a 2-operand instruction needs an intermediate temporary.
489 static void unalias2( struct brw_vs_compile
*c
,
493 void (*func
)( struct brw_vs_compile
*,
498 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
499 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
500 struct brw_compile
*p
= &c
->func
;
501 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
502 func(c
, tmp
, arg0
, arg1
);
503 brw_MOV(p
, dst
, tmp
);
507 func(c
, dst
, arg0
, arg1
);
513 * Checks if a 3-operand instruction needs an intermediate temporary.
515 static void unalias3( struct brw_vs_compile
*c
,
520 void (*func
)( struct brw_vs_compile
*,
526 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
527 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
528 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
529 struct brw_compile
*p
= &c
->func
;
530 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
531 func(c
, tmp
, arg0
, arg1
, arg2
);
532 brw_MOV(p
, dst
, tmp
);
536 func(c
, dst
, arg0
, arg1
, arg2
);
540 static void emit_sop( struct brw_vs_compile
*c
,
546 struct brw_compile
*p
= &c
->func
;
548 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
549 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
550 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
551 brw_set_predicate_control_flag_value(p
, 0xff);
554 static void emit_seq( struct brw_vs_compile
*c
,
557 struct brw_reg arg1
)
559 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
562 static void emit_sne( struct brw_vs_compile
*c
,
565 struct brw_reg arg1
)
567 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
569 static void emit_slt( struct brw_vs_compile
*c
,
572 struct brw_reg arg1
)
574 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
577 static void emit_sle( struct brw_vs_compile
*c
,
580 struct brw_reg arg1
)
582 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
585 static void emit_sgt( struct brw_vs_compile
*c
,
588 struct brw_reg arg1
)
590 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
593 static void emit_sge( struct brw_vs_compile
*c
,
596 struct brw_reg arg1
)
598 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
601 static void emit_cmp( struct brw_compile
*p
,
605 struct brw_reg arg2
)
607 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
608 brw_SEL(p
, dst
, arg1
, arg2
);
609 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
612 static void emit_sign(struct brw_vs_compile
*c
,
616 struct brw_compile
*p
= &c
->func
;
618 brw_MOV(p
, dst
, brw_imm_f(0));
620 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
621 brw_MOV(p
, dst
, brw_imm_f(-1.0));
622 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
624 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
625 brw_MOV(p
, dst
, brw_imm_f(1.0));
626 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
629 static void emit_max( struct brw_compile
*p
,
632 struct brw_reg arg1
)
634 struct intel_context
*intel
= &p
->brw
->intel
;
636 if (intel
->gen
>= 6) {
637 brw_set_conditionalmod(p
, BRW_CONDITIONAL_GE
);
638 brw_SEL(p
, dst
, arg0
, arg1
);
639 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
640 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
642 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
643 brw_SEL(p
, dst
, arg0
, arg1
);
644 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
648 static void emit_min( struct brw_compile
*p
,
651 struct brw_reg arg1
)
653 struct intel_context
*intel
= &p
->brw
->intel
;
655 if (intel
->gen
>= 6) {
656 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
657 brw_SEL(p
, dst
, arg0
, arg1
);
658 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
659 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
661 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
662 brw_SEL(p
, dst
, arg0
, arg1
);
663 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
667 static void emit_arl(struct brw_compile
*p
,
671 struct intel_context
*intel
= &p
->brw
->intel
;
673 if (intel
->gen
>= 6) {
674 struct brw_reg dst_f
= retype(dst
, BRW_REGISTER_TYPE_F
);
676 brw_RNDD(p
, dst_f
, src
);
677 brw_MOV(p
, dst
, dst_f
);
679 brw_RNDD(p
, dst
, src
);
683 static void emit_math1_gen4(struct brw_vs_compile
*c
,
689 /* There are various odd behaviours with SEND on the simulator. In
690 * addition there are documented issues with the fact that the GEN4
691 * processor doesn't do dependency control properly on SEND
692 * results. So, on balance, this kludge to get around failures
693 * with writemasked math results looks like it might be necessary
694 * whether that turns out to be a simulator bug or not:
696 struct brw_compile
*p
= &c
->func
;
697 struct brw_reg tmp
= dst
;
698 GLboolean need_tmp
= GL_FALSE
;
700 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
701 dst
.dw1
.bits
.writemask
!= 0xf)
710 BRW_MATH_SATURATE_NONE
,
713 BRW_MATH_DATA_SCALAR
,
717 brw_MOV(p
, dst
, tmp
);
723 emit_math1_gen6(struct brw_vs_compile
*c
,
729 struct brw_compile
*p
= &c
->func
;
730 struct brw_reg tmp_src
, tmp_dst
;
732 /* Something is strange on gen6 math in 16-wide mode, though the
733 * docs say it's supposed to work. Punt to using align1 mode,
734 * which doesn't do writemasking and swizzles.
736 tmp_src
= get_tmp(c
);
737 tmp_dst
= get_tmp(c
);
739 brw_MOV(p
, tmp_src
, arg0
);
741 brw_set_access_mode(p
, BRW_ALIGN_1
);
745 BRW_MATH_SATURATE_NONE
,
748 BRW_MATH_DATA_SCALAR
,
750 brw_set_access_mode(p
, BRW_ALIGN_16
);
752 brw_MOV(p
, dst
, tmp_dst
);
754 release_tmp(c
, tmp_src
);
755 release_tmp(c
, tmp_dst
);
759 emit_math1(struct brw_vs_compile
*c
,
765 struct brw_compile
*p
= &c
->func
;
766 struct intel_context
*intel
= &p
->brw
->intel
;
769 emit_math1_gen6(c
, function
, dst
, arg0
, precision
);
771 emit_math1_gen4(c
, function
, dst
, arg0
, precision
);
774 static void emit_math2_gen4( struct brw_vs_compile
*c
,
781 struct brw_compile
*p
= &c
->func
;
782 struct brw_reg tmp
= dst
;
783 GLboolean need_tmp
= GL_FALSE
;
785 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
786 dst
.dw1
.bits
.writemask
!= 0xf)
792 brw_MOV(p
, brw_message_reg(3), arg1
);
797 BRW_MATH_SATURATE_NONE
,
800 BRW_MATH_DATA_SCALAR
,
804 brw_MOV(p
, dst
, tmp
);
809 static void emit_math2_gen6( struct brw_vs_compile
*c
,
816 struct brw_compile
*p
= &c
->func
;
817 struct brw_reg tmp_src0
, tmp_src1
, tmp_dst
;
819 tmp_src0
= get_tmp(c
);
820 tmp_src1
= get_tmp(c
);
821 tmp_dst
= get_tmp(c
);
823 brw_MOV(p
, tmp_src0
, arg0
);
824 brw_MOV(p
, tmp_src1
, arg1
);
826 brw_set_access_mode(p
, BRW_ALIGN_1
);
832 brw_set_access_mode(p
, BRW_ALIGN_16
);
834 brw_MOV(p
, dst
, tmp_dst
);
836 release_tmp(c
, tmp_src0
);
837 release_tmp(c
, tmp_src1
);
838 release_tmp(c
, tmp_dst
);
841 static void emit_math2( struct brw_vs_compile
*c
,
848 struct brw_compile
*p
= &c
->func
;
849 struct intel_context
*intel
= &p
->brw
->intel
;
852 emit_math2_gen6(c
, function
, dst
, arg0
, arg1
, precision
);
854 emit_math2_gen4(c
, function
, dst
, arg0
, arg1
, precision
);
857 static void emit_exp_noalias( struct brw_vs_compile
*c
,
859 struct brw_reg arg0
)
861 struct brw_compile
*p
= &c
->func
;
864 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
865 struct brw_reg tmp
= get_tmp(c
);
866 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
868 /* tmp_d = floor(arg0.x) */
869 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
871 /* result[0] = 2.0 ^ tmp */
873 /* Adjust exponent for floating point:
876 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
878 /* Install exponent and sign.
879 * Excess drops off the edge:
881 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
882 tmp_d
, brw_imm_d(23));
887 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
888 /* result[1] = arg0.x - floor(arg0.x) */
889 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
892 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
893 /* As with the LOG instruction, we might be better off just
894 * doing a taylor expansion here, seeing as we have to do all
897 * If mathbox partial precision is too low, consider also:
898 * result[2] = result[0] * EXP(result[1])
901 BRW_MATH_FUNCTION_EXP
,
902 brw_writemask(dst
, WRITEMASK_Z
),
903 brw_swizzle1(arg0
, 0),
904 BRW_MATH_PRECISION_FULL
);
907 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
908 /* result[3] = 1.0; */
909 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
914 static void emit_log_noalias( struct brw_vs_compile
*c
,
916 struct brw_reg arg0
)
918 struct brw_compile
*p
= &c
->func
;
919 struct brw_reg tmp
= dst
;
920 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
921 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
922 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
923 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
927 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
930 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
933 * These almost look like they could be joined up, but not really
936 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
937 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
939 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
941 brw_writemask(tmp_ud
, WRITEMASK_X
),
942 brw_swizzle1(arg0_ud
, 0),
943 brw_imm_ud((1U<<31)-1));
946 brw_writemask(tmp_ud
, WRITEMASK_X
),
951 brw_writemask(tmp
, WRITEMASK_X
),
952 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
956 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
958 brw_writemask(tmp_ud
, WRITEMASK_Y
),
959 brw_swizzle1(arg0_ud
, 0),
960 brw_imm_ud((1<<23)-1));
963 brw_writemask(tmp_ud
, WRITEMASK_Y
),
965 brw_imm_ud(127<<23));
968 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
969 /* result[2] = result[0] + LOG2(result[1]); */
971 /* Why bother? The above is just a hint how to do this with a
972 * taylor series. Maybe we *should* use a taylor series as by
973 * the time all the above has been done it's almost certainly
974 * quicker than calling the mathbox, even with low precision.
977 * - result[0] + mathbox.LOG2(result[1])
978 * - mathbox.LOG2(arg0.x)
979 * - result[0] + inline_taylor_approx(result[1])
982 BRW_MATH_FUNCTION_LOG
,
983 brw_writemask(tmp
, WRITEMASK_Z
),
984 brw_swizzle1(tmp
, 1),
985 BRW_MATH_PRECISION_FULL
);
988 brw_writemask(tmp
, WRITEMASK_Z
),
989 brw_swizzle1(tmp
, 2),
990 brw_swizzle1(tmp
, 0));
993 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
994 /* result[3] = 1.0; */
995 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
999 brw_MOV(p
, dst
, tmp
);
1000 release_tmp(c
, tmp
);
1005 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
1007 static void emit_dst_noalias( struct brw_vs_compile
*c
,
1009 struct brw_reg arg0
,
1010 struct brw_reg arg1
)
1012 struct brw_compile
*p
= &c
->func
;
1014 /* There must be a better way to do this:
1016 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
1017 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
1018 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
1019 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
1020 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
1021 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
1022 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
1023 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
1027 static void emit_xpd( struct brw_compile
*p
,
1032 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
1033 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
1037 static void emit_lit_noalias( struct brw_vs_compile
*c
,
1039 struct brw_reg arg0
)
1041 struct brw_compile
*p
= &c
->func
;
1042 struct brw_instruction
*if_insn
;
1043 struct brw_reg tmp
= dst
;
1044 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1049 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
1050 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
1052 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1053 * to get all channels active inside the IF. In the clipping code
1054 * we run with NoMask, so it's not an option and we can use
1055 * BRW_EXECUTE_1 for all comparisons.
1057 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
1058 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
1060 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
1062 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
1063 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
1064 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1067 BRW_MATH_FUNCTION_POW
,
1068 brw_writemask(dst
, WRITEMASK_Z
),
1069 brw_swizzle1(tmp
, 2),
1070 brw_swizzle1(arg0
, 3),
1071 BRW_MATH_PRECISION_PARTIAL
);
1074 brw_ENDIF(p
, if_insn
);
1076 release_tmp(c
, tmp
);
1079 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
1081 struct brw_reg arg0
,
1082 struct brw_reg arg1
,
1083 struct brw_reg arg2
)
1085 struct brw_compile
*p
= &c
->func
;
1087 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
1088 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
1089 brw_MAC(p
, dst
, arg0
, arg1
);
1092 /** 3 or 4-component vector normalization */
1093 static void emit_nrm( struct brw_vs_compile
*c
,
1095 struct brw_reg arg0
,
1098 struct brw_compile
*p
= &c
->func
;
1099 struct brw_reg tmp
= get_tmp(c
);
1101 /* tmp = dot(arg0, arg0) */
1103 brw_DP3(p
, tmp
, arg0
, arg0
);
1105 brw_DP4(p
, tmp
, arg0
, arg0
);
1107 /* tmp = 1 / sqrt(tmp) */
1108 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
1110 /* dst = arg0 * tmp */
1111 brw_MUL(p
, dst
, arg0
, tmp
);
1113 release_tmp(c
, tmp
);
1117 static struct brw_reg
1118 get_constant(struct brw_vs_compile
*c
,
1119 const struct prog_instruction
*inst
,
1122 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1123 struct brw_compile
*p
= &c
->func
;
1124 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1126 assert(argIndex
< 3);
1128 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
1129 /* Keep track of the last constant loaded in this slot, for reuse. */
1130 c
->current_const
[argIndex
].index
= src
->Index
;
1133 printf(" fetch const[%d] for arg %d into reg %d\n",
1134 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1136 /* need to fetch the constant now */
1138 const_reg
, /* writeback dest */
1139 16 * src
->Index
, /* byte offset */
1140 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
1144 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1145 const_reg
= stride(const_reg
, 0, 4, 1);
1146 const_reg
.subnr
= 0;
1151 static struct brw_reg
1152 get_reladdr_constant(struct brw_vs_compile
*c
,
1153 const struct prog_instruction
*inst
,
1156 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1157 struct brw_compile
*p
= &c
->func
;
1158 struct brw_context
*brw
= p
->brw
;
1159 struct intel_context
*intel
= &brw
->intel
;
1160 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1161 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1164 assert(argIndex
< 3);
1166 /* Can't reuse a reladdr constant load. */
1167 c
->current_const
[argIndex
].index
= -1;
1170 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1171 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1174 if (intel
->gen
>= 6) {
1175 offset
= src
->Index
;
1177 struct brw_reg byte_addr_reg
= retype(get_tmp(c
), BRW_REGISTER_TYPE_D
);
1178 brw_MUL(p
, byte_addr_reg
, addr_reg
, brw_imm_d(16));
1179 addr_reg
= byte_addr_reg
;
1180 offset
= 16 * src
->Index
;
1183 /* fetch the first vec4 */
1184 brw_dp_READ_4_vs_relative(p
,
1188 SURF_INDEX_VERT_CONST_BUFFER
);
1195 /* TODO: relative addressing!
1197 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
1198 gl_register_file file
,
1202 case PROGRAM_TEMPORARY
:
1204 case PROGRAM_OUTPUT
:
1205 assert(c
->regs
[file
][index
].nr
!= 0);
1206 return c
->regs
[file
][index
];
1207 case PROGRAM_STATE_VAR
:
1208 case PROGRAM_CONSTANT
:
1209 case PROGRAM_UNIFORM
:
1210 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1211 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1212 case PROGRAM_ADDRESS
:
1214 return c
->regs
[file
][index
];
1216 case PROGRAM_UNDEFINED
: /* undef values */
1217 return brw_null_reg();
1219 case PROGRAM_LOCAL_PARAM
:
1220 case PROGRAM_ENV_PARAM
:
1221 case PROGRAM_WRITE_ONLY
:
1224 return brw_null_reg();
1230 * Indirect addressing: get reg[[arg] + offset].
1232 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1237 struct brw_compile
*p
= &c
->func
;
1238 struct brw_reg tmp
= get_tmp(c
);
1239 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1240 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1241 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1242 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1243 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1245 /* Set the vertical stride on the register access so that the first
1246 * 4 components come from a0.0 and the second 4 from a0.1.
1248 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1251 brw_push_insn_state(p
);
1252 brw_set_access_mode(p
, BRW_ALIGN_1
);
1254 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1255 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1257 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1258 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1260 brw_MOV(p
, tmp
, indirect
);
1262 brw_pop_insn_state(p
);
1265 /* NOTE: tmp not released */
1270 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1271 const struct prog_instruction
*inst
,
1274 struct brw_compile
*p
= &c
->func
;
1276 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1277 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1278 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1279 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1280 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1281 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1283 /* Because destination register indirect addressing can only use
1284 * one index, we'll write each vertex's vec4 value separately.
1286 val
.width
= BRW_WIDTH_4
;
1287 val
.vstride
= BRW_VERTICAL_STRIDE_4
;
1289 brw_push_insn_state(p
);
1290 brw_set_access_mode(p
, BRW_ALIGN_1
);
1292 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1293 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1294 brw_MOV(p
, indirect
, val
);
1296 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1297 brw_ADD(p
, brw_address_reg(0), acc
,
1298 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1299 brw_MOV(p
, indirect
, suboffset(val
, 4));
1301 brw_pop_insn_state(p
);
1305 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1306 * TODO: relative addressing!
1308 static struct brw_reg
1309 get_src_reg( struct brw_vs_compile
*c
,
1310 const struct prog_instruction
*inst
,
1313 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1314 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1315 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1317 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1318 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1320 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1324 return brw_imm_f(0.0f
);
1325 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1330 return brw_imm_f(-1.0F
);
1332 return brw_imm_f(1.0F
);
1333 } else if (src
->File
== PROGRAM_CONSTANT
) {
1334 const struct gl_program_parameter_list
*params
;
1338 switch (src
->Swizzle
) {
1353 if (component
>= 0) {
1354 params
= c
->vp
->program
.Base
.Parameters
;
1355 f
= params
->ParameterValues
[src
->Index
][component
];
1361 return brw_imm_f(f
);
1367 case PROGRAM_TEMPORARY
:
1369 case PROGRAM_OUTPUT
:
1371 return deref(c
, c
->regs
[file
][0], index
, 32);
1374 assert(c
->regs
[file
][index
].nr
!= 0);
1375 return c
->regs
[file
][index
];
1378 case PROGRAM_STATE_VAR
:
1379 case PROGRAM_CONSTANT
:
1380 case PROGRAM_UNIFORM
:
1381 case PROGRAM_ENV_PARAM
:
1382 case PROGRAM_LOCAL_PARAM
:
1383 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1384 /* Take from the push constant buffer if possible. */
1385 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1386 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1388 /* Must be in the pull constant buffer then .*/
1389 assert(c
->vp
->use_const_buffer
);
1391 return get_reladdr_constant(c
, inst
, argIndex
);
1393 return get_constant(c
, inst
, argIndex
);
1395 case PROGRAM_ADDRESS
:
1397 return c
->regs
[file
][index
];
1399 case PROGRAM_UNDEFINED
:
1400 /* this is a normal case since we loop over all three src args */
1401 return brw_null_reg();
1403 case PROGRAM_WRITE_ONLY
:
1406 return brw_null_reg();
1411 * Return the brw reg for the given instruction's src argument.
1412 * Will return mangled results for SWZ op. The emit_swz() function
1413 * ignores this result and recalculates taking extended swizzles into
1416 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1417 const struct prog_instruction
*inst
,
1420 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1423 if (src
->File
== PROGRAM_UNDEFINED
)
1424 return brw_null_reg();
1426 reg
= get_src_reg(c
, inst
, argIndex
);
1428 /* Convert 3-bit swizzle to 2-bit.
1430 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1431 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1432 GET_SWZ(src
->Swizzle
, 1),
1433 GET_SWZ(src
->Swizzle
, 2),
1434 GET_SWZ(src
->Swizzle
, 3));
1436 /* Note this is ok for non-swizzle ARB_vp instructions */
1437 reg
.negate
= src
->Negate
? 1 : 0;
1445 * Get brw register for the given program dest register.
1447 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1448 struct prog_dst_register dst
)
1453 case PROGRAM_TEMPORARY
:
1454 case PROGRAM_OUTPUT
:
1455 /* register-indirect addressing is only 1x1, not VxH, for
1456 * destination regs. So, for RelAddr we'll return a temporary
1457 * for the dest and do a move of the result to the RelAddr
1458 * register after the instruction emit.
1463 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1464 reg
= c
->regs
[dst
.File
][dst
.Index
];
1467 case PROGRAM_ADDRESS
:
1468 assert(dst
.Index
== 0);
1469 reg
= c
->regs
[dst
.File
][dst
.Index
];
1471 case PROGRAM_UNDEFINED
:
1472 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1473 reg
= brw_null_reg();
1477 reg
= brw_null_reg();
1480 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1481 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1487 static void emit_swz( struct brw_vs_compile
*c
,
1489 const struct prog_instruction
*inst
)
1491 const GLuint argIndex
= 0;
1492 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1493 struct brw_compile
*p
= &c
->func
;
1494 GLuint zeros_mask
= 0;
1495 GLuint ones_mask
= 0;
1496 GLuint src_mask
= 0;
1498 GLboolean need_tmp
= (src
.Negate
&&
1499 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1500 struct brw_reg tmp
= dst
;
1506 for (i
= 0; i
< 4; i
++) {
1507 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1508 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1527 /* Do src first, in case dst aliases src:
1530 struct brw_reg arg0
;
1532 arg0
= get_src_reg(c
, inst
, argIndex
);
1534 arg0
= brw_swizzle(arg0
,
1535 src_swz
[0], src_swz
[1],
1536 src_swz
[2], src_swz
[3]);
1538 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1542 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1545 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1548 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1551 brw_MOV(p
, dst
, tmp
);
1552 release_tmp(c
, tmp
);
1557 align_interleaved_urb_mlen(struct brw_context
*brw
, int mlen
)
1559 struct intel_context
*intel
= &brw
->intel
;
1561 if (intel
->gen
>= 6) {
1562 /* URB data written (does not include the message header reg) must
1563 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1564 * section 5.4.3.2.2: URB_INTERLEAVED.
1566 * URB entries are allocated on a multiple of 1024 bits, so an
1567 * extra 128 bits written here to make the end align to 256 is
1570 if ((mlen
% 2) != 1)
1578 * Post-vertex-program processing. Send the results to the URB.
1580 static void emit_vertex_write( struct brw_vs_compile
*c
)
1582 struct brw_compile
*p
= &c
->func
;
1583 struct brw_context
*brw
= p
->brw
;
1584 struct intel_context
*intel
= &brw
->intel
;
1585 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1588 GLuint len_vertex_header
= 2;
1592 if (c
->key
.copy_edgeflag
) {
1594 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1595 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1598 if (intel
->gen
< 6) {
1599 /* Build ndc coords */
1601 /* ndc = 1.0 / pos.w */
1602 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1603 /* ndc.xyz = pos * ndc */
1604 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1607 /* Update the header for point size, user clipping flags, and -ve rhw
1610 if (intel
->gen
>= 6) {
1611 struct brw_reg m1
= brw_message_reg(1);
1613 /* On gen6, m1 has each value in a separate dword, so we never
1614 * need to mess with a temporary for computing the m1 value.
1616 brw_MOV(p
, retype(m1
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1617 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1618 brw_MOV(p
, brw_writemask(m1
, WRITEMASK_W
),
1619 brw_swizzle1(c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
], 0));
1622 /* Set the user clip distances in dword 8-15. (m3-4)*/
1623 if (c
->key
.nr_userclip
) {
1624 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1627 m
= brw_message_reg(3);
1629 m
= brw_message_reg(4);
1631 brw_DP4(p
, brw_writemask(m
, (1 << (i
& 7))),pos
, c
->userplane
[i
]);
1634 } else if ((c
->prog_data
.outputs_written
&
1635 BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1636 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
) {
1637 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1640 brw_MOV(p
, header1
, brw_imm_ud(0));
1642 brw_set_access_mode(p
, BRW_ALIGN_16
);
1644 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1645 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1646 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
),
1647 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1648 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
),
1649 header1
, brw_imm_ud(0x7ff<<8));
1652 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1653 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1654 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1655 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1656 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1659 /* i965 clipping workaround:
1660 * 1) Test for -ve rhw
1662 * set ndc = (0,0,0,0)
1665 * Later, clipping will detect ucp[6] and ensure the primitive is
1666 * clipped against all fixed planes.
1668 if (brw
->has_negative_rhw_bug
) {
1670 vec8(brw_null_reg()),
1672 brw_swizzle1(ndc
, 3),
1675 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1676 brw_MOV(p
, ndc
, brw_imm_f(0));
1677 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1680 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1681 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1682 brw_set_access_mode(p
, BRW_ALIGN_16
);
1684 release_tmp(c
, header1
);
1687 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1690 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1691 * of zeros followed by two sets of NDC coordinates:
1693 brw_set_access_mode(p
, BRW_ALIGN_1
);
1694 brw_set_acc_write_control(p
, 0);
1696 /* The VUE layout is documented in Volume 2a. */
1697 if (intel
->gen
>= 6) {
1698 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1699 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1700 * dword 4-7 (m2) is the 4D space position
1701 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1703 * m3 or 5 is the first vertex element data we fill, which is
1704 * the vertex position.
1706 brw_MOV(p
, brw_message_reg(2), pos
);
1707 len_vertex_header
= 1;
1708 if (c
->key
.nr_userclip
> 0)
1709 len_vertex_header
+= 2;
1710 } else if (intel
->gen
== 5) {
1711 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1712 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1713 * dword 4-7 (m2) is the ndc position (set above)
1714 * dword 8-11 (m3) of the vertex header is the 4D space position
1715 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1716 * m6 is a pad so that the vertex element data is aligned
1717 * m7 is the first vertex data we fill, which is the vertex position.
1719 brw_MOV(p
, brw_message_reg(2), ndc
);
1720 brw_MOV(p
, brw_message_reg(3), pos
);
1721 brw_MOV(p
, brw_message_reg(7), pos
);
1722 len_vertex_header
= 6;
1724 /* There are 8 dwords in VUE header pre-Ironlake:
1725 * dword 0-3 (m1) is indices, point width, clip flags.
1726 * dword 4-7 (m2) is ndc position (set above)
1728 * dword 8-11 (m3) is the first vertex data, which we always have be the
1731 brw_MOV(p
, brw_message_reg(2), ndc
);
1732 brw_MOV(p
, brw_message_reg(3), pos
);
1733 len_vertex_header
= 2;
1736 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1737 next_mrf
= 2 + len_vertex_header
;
1738 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1739 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1741 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
1743 if (i
== VERT_RESULT_PSIZ
)
1746 if (i
>= VERT_RESULT_TEX0
&&
1747 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1748 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1750 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1751 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
1755 eot
= (c
->first_overflow_output
== 0);
1757 /* Message header, plus VUE header, plus the (first set of) outputs. */
1758 msg_len
= 1 + len_vertex_header
+ c
->nr_outputs
;
1759 msg_len
= align_interleaved_urb_mlen(brw
, msg_len
);
1760 /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
1761 msg_len
= MIN2(msg_len
, (BRW_MAX_MRF
- 1)),
1764 brw_null_reg(), /* dest */
1765 0, /* starting mrf reg nr */
1770 0, /* response len */
1772 eot
, /* writes complete */
1773 0, /* urb destination offset */
1774 BRW_URB_SWIZZLE_INTERLEAVE
);
1776 if (c
->first_overflow_output
> 0) {
1777 /* Not all of the vertex outputs/results fit into the MRF.
1778 * Move the overflowed attributes from the GRF to the MRF and
1779 * issue another brw_urb_WRITE().
1782 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1783 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1784 /* move from GRF to MRF */
1785 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1791 brw_null_reg(), /* dest */
1792 0, /* starting mrf reg nr */
1796 align_interleaved_urb_mlen(brw
, mrf
),
1797 0, /* response len */
1799 1, /* writes complete */
1800 14 / 2, /* urb destination offset */
1801 BRW_URB_SWIZZLE_INTERLEAVE
);
1806 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1808 struct brw_compile
*p
= &c
->func
;
1809 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1811 if (p
->nr_insn
== 0)
1814 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1817 switch (prev_insn
->header
.opcode
) {
1818 case BRW_OPCODE_MOV
:
1819 case BRW_OPCODE_MAC
:
1820 case BRW_OPCODE_MUL
:
1821 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1822 prev_insn
->header
.execution_size
== val
.width
&&
1823 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1824 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1825 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1826 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1827 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1828 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1838 get_predicate(const struct prog_instruction
*inst
)
1840 if (inst
->DstReg
.CondMask
== COND_TR
)
1841 return BRW_PREDICATE_NONE
;
1843 /* All of GLSL only produces predicates for COND_NE and one channel per
1844 * vector. Fail badly if someone starts doing something else, as it might
1845 * mean infinite looping or something.
1847 * We'd like to support all the condition codes, but our hardware doesn't
1848 * quite match the Mesa IR, which is modeled after the NV extensions. For
1849 * those, the instruction may update the condition codes or not, then any
1850 * later instruction may use one of those condition codes. For gen4, the
1851 * instruction may update the flags register based on one of the condition
1852 * codes output by the instruction, and then further instructions may
1853 * predicate on that. We can probably support this, but it won't
1854 * necessarily be easy.
1856 assert(inst
->DstReg
.CondMask
== COND_NE
);
1858 switch (inst
->DstReg
.CondSwizzle
) {
1860 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1862 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1864 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1866 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1868 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1869 inst
->DstReg
.CondMask
);
1870 return BRW_PREDICATE_NORMAL
;
1874 /* Emit the vertex program instructions here.
1876 void brw_vs_emit(struct brw_vs_compile
*c
)
1878 #define MAX_IF_DEPTH 32
1879 #define MAX_LOOP_DEPTH 32
1880 struct brw_compile
*p
= &c
->func
;
1881 struct brw_context
*brw
= p
->brw
;
1882 struct intel_context
*intel
= &brw
->intel
;
1883 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1884 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1885 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1886 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1887 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1891 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
1892 printf("vs-mesa:\n");
1893 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1898 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1899 brw_set_access_mode(p
, BRW_ALIGN_16
);
1900 if_depth_in_loop
[loop_depth
] = 0;
1902 brw_set_acc_write_control(p
, 1);
1904 for (insn
= 0; insn
< nr_insns
; insn
++) {
1906 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1908 /* Message registers can't be read, so copy the output into GRF
1909 * register if they are used in source registers
1911 for (i
= 0; i
< 3; i
++) {
1912 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1913 GLuint index
= src
->Index
;
1914 GLuint file
= src
->File
;
1915 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1916 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1919 switch (inst
->Opcode
) {
1922 c
->needs_stack
= GL_TRUE
;
1929 /* Static register allocation
1931 brw_vs_alloc_regs(c
);
1934 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1936 for (insn
= 0; insn
< nr_insns
; insn
++) {
1938 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1939 struct brw_reg args
[3], dst
;
1943 printf("%d: ", insn
);
1944 _mesa_print_instruction(inst
);
1947 /* Get argument regs. SWZ is special and does this itself.
1949 if (inst
->Opcode
!= OPCODE_SWZ
)
1950 for (i
= 0; i
< 3; i
++) {
1951 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1954 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1955 args
[i
] = c
->output_regs
[index
].reg
;
1957 args
[i
] = get_arg(c
, inst
, i
);
1960 /* Get dest regs. Note that it is possible for a reg to be both
1961 * dst and arg, given the static allocation of registers. So
1962 * care needs to be taken emitting multi-operation instructions.
1964 index
= inst
->DstReg
.Index
;
1965 file
= inst
->DstReg
.File
;
1966 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1967 dst
= c
->output_regs
[index
].reg
;
1969 dst
= get_dst(c
, inst
->DstReg
);
1971 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1972 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1973 inst
->SaturateMode
);
1976 switch (inst
->Opcode
) {
1978 args
[0].negate
= false;
1979 brw_MOV(p
, dst
, brw_abs(args
[0]));
1982 brw_ADD(p
, dst
, args
[0], args
[1]);
1985 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1988 brw_DP2(p
, dst
, args
[0], args
[1]);
1991 brw_DP3(p
, dst
, args
[0], args
[1]);
1994 brw_DP4(p
, dst
, args
[0], args
[1]);
1997 brw_DPH(p
, dst
, args
[0], args
[1]);
2000 emit_nrm(c
, dst
, args
[0], 3);
2003 emit_nrm(c
, dst
, args
[0], 4);
2006 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
2009 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
2012 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2015 emit_arl(p
, dst
, args
[0]);
2018 brw_RNDD(p
, dst
, args
[0]);
2021 brw_FRC(p
, dst
, args
[0]);
2024 unalias1(c
, dst
, args
[0], emit_log_noalias
);
2027 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2030 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
2033 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
2036 if (!accumulator_contains(c
, args
[2]))
2037 brw_MOV(p
, brw_acc_reg(), args
[2]);
2038 brw_MAC(p
, dst
, args
[0], args
[1]);
2041 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
2044 emit_max(p
, dst
, args
[0], args
[1]);
2047 emit_min(p
, dst
, args
[0], args
[1]);
2050 brw_MOV(p
, dst
, args
[0]);
2053 brw_MUL(p
, dst
, args
[0], args
[1]);
2056 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
2059 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2062 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, brw_abs(args
[0]), BRW_MATH_PRECISION_FULL
);
2066 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
2069 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2072 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
2075 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
2078 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
2081 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
2084 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
2087 unalias1(c
, dst
, args
[0], emit_sign
);
2090 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
2093 /* The args[0] value can't be used here as it won't have
2094 * correctly encoded the full swizzle:
2096 emit_swz(c
, dst
, inst
);
2099 /* round toward zero */
2100 brw_RNDZ(p
, dst
, args
[0]);
2103 emit_xpd(p
, dst
, args
[0], args
[1]);
2106 assert(if_depth
< MAX_IF_DEPTH
);
2107 if_inst
[if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
2108 /* Note that brw_IF smashes the predicate_control field. */
2109 if_inst
[if_depth
]->header
.predicate_control
= get_predicate(inst
);
2110 if_depth_in_loop
[loop_depth
]++;
2114 clear_current_const(c
);
2115 assert(if_depth
> 0);
2116 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
2119 clear_current_const(c
);
2120 assert(if_depth
> 0);
2121 brw_ENDIF(p
, if_inst
[--if_depth
]);
2122 if_depth_in_loop
[loop_depth
]--;
2124 case OPCODE_BGNLOOP
:
2125 clear_current_const(c
);
2126 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
2127 if_depth_in_loop
[loop_depth
] = 0;
2130 brw_set_predicate_control(p
, get_predicate(inst
));
2131 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
2132 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2135 brw_set_predicate_control(p
, get_predicate(inst
));
2136 if (intel
->gen
>= 6) {
2137 gen6_CONT(p
, loop_inst
[loop_depth
- 1]);
2139 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
2141 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2144 case OPCODE_ENDLOOP
: {
2145 clear_current_const(c
);
2146 struct brw_instruction
*inst0
, *inst1
;
2151 if (intel
->gen
== 5)
2154 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2156 if (intel
->gen
< 6) {
2157 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2158 while (inst0
> loop_inst
[loop_depth
]) {
2160 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
2161 inst0
->bits3
.if_else
.jump_count
== 0) {
2162 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2163 } else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
2164 inst0
->bits3
.if_else
.jump_count
== 0) {
2165 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2173 brw_set_predicate_control(p
, get_predicate(inst
));
2174 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2175 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2178 brw_set_access_mode(p
, BRW_ALIGN_1
);
2179 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2180 brw_set_access_mode(p
, BRW_ALIGN_16
);
2181 brw_ADD(p
, get_addr_reg(stack_index
),
2182 get_addr_reg(stack_index
), brw_imm_d(4));
2183 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
2184 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2187 brw_ADD(p
, get_addr_reg(stack_index
),
2188 get_addr_reg(stack_index
), brw_imm_d(-4));
2189 brw_set_access_mode(p
, BRW_ALIGN_1
);
2190 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
2191 brw_set_access_mode(p
, BRW_ALIGN_16
);
2194 emit_vertex_write(c
);
2200 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
2206 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
2207 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
2208 _mesa_opcode_string(inst
->Opcode
) :
2212 /* Set the predication update on the last instruction of the native
2213 * instruction sequence.
2215 * This would be problematic if it was set on a math instruction,
2216 * but that shouldn't be the case with the current GLSL compiler.
2218 if (inst
->CondUpdate
) {
2219 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
2221 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
2222 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
2225 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
2226 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
2227 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
2228 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
2231 /* Result color clamping.
2233 * When destination register is an output register and
2234 * it's primary/secondary front/back color, we have to clamp
2235 * the result to [0,1]. This is done by enabling the
2236 * saturation bit for the last instruction.
2238 * We don't use brw_set_saturate() as it modifies
2239 * p->current->header.saturate, which affects all the subsequent
2240 * instructions. Instead, we directly modify the header
2241 * of the last (already stored) instruction.
2243 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
2244 c
->key
.clamp_vertex_color
) {
2245 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
2246 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
2247 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
2248 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
2249 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
2253 if (inst
->DstReg
.RelAddr
) {
2254 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
2255 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
2256 move_to_reladdr_dst(c
, inst
, dst
);
2262 brw_resolve_cals(p
);
2267 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
2270 printf("vs-native:\n");
2271 for (i
= 0; i
< p
->nr_insn
; i
++)
2272 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);