2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allows two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
124 clear_current_const(struct brw_vs_compile
*c
)
128 if (c
->vp
->use_const_buffer
) {
129 for (i
= 0; i
< 3; i
++) {
130 c
->current_const
[i
].index
= -1;
135 /* The message length for all SEND messages is restricted to [1,15]. This
136 * includes 1 for the header, so anything in slots 14 and above needs to be
137 * placed in a general-purpose register and emitted using a second URB write.
139 #define MAX_SLOTS_IN_FIRST_URB_WRITE 14
142 * Determine whether the given vertex output can be written directly to a MRF
143 * or whether it has to be stored in a general-purpose register.
145 static inline bool can_use_direct_mrf(int vert_result
,
146 int first_reladdr_output
, int slot
)
148 if (vert_result
== VERT_RESULT_HPOS
|| vert_result
== VERT_RESULT_PSIZ
) {
149 /* These never go straight into MRF's. They are placed in the MRF by
154 if (first_reladdr_output
<= vert_result
&& vert_result
< VERT_RESULT_MAX
) {
155 /* Relative addressing might be used to access this vert_result, so it
156 * needs to go into a general-purpose register.
160 if (slot
>= MAX_SLOTS_IN_FIRST_URB_WRITE
) {
161 /* This output won't go out until the second URB write so it must be
162 * stored in a general-purpose register until then.
170 * Preallocate GRF register before code emit.
171 * Do things as simply as possible. Allocate and populate all regs
174 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
176 struct intel_context
*intel
= &c
->func
.brw
->intel
;
177 GLuint i
, reg
= 0, slot
;
178 int attributes_in_vue
;
179 int first_reladdr_output
;
182 struct brw_vertex_program
*vp
= c
->vp
;
183 const struct gl_program_parameter_list
*params
= vp
->program
.Base
.Parameters
;
185 /* Determine whether to use a real constant buffer or use a block
186 * of GRF registers for constants. The latter is faster but only
187 * works if everything fits in the GRF.
188 * XXX this heuristic/check may need some fine tuning...
190 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
191 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
192 c
->vp
->use_const_buffer
= true;
194 c
->vp
->use_const_buffer
= false;
196 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
198 /* r0 -- reserved as usual
200 c
->r0
= brw_vec8_grf(reg
, 0);
203 /* User clip planes from curbe:
205 if (c
->key
.userclip_active
) {
206 if (intel
->gen
>= 6) {
207 for (i
= 0; i
<= c
->key
.nr_userclip_plane_consts
; i
++) {
208 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
209 (i
% 2) * 4), 0, 4, 1);
211 reg
+= ALIGN(c
->key
.nr_userclip_plane_consts
, 2) / 2;
213 for (i
= 0; i
< c
->key
.nr_userclip_plane_consts
; i
++) {
214 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ (6 + i
) / 2,
215 (i
% 2) * 4), 0, 4, 1);
217 reg
+= (ALIGN(6 + c
->key
.nr_userclip_plane_consts
, 4) / 4) * 2;
222 /* Assign some (probably all) of the vertex program constants to
223 * the push constant buffer/CURBE.
225 * There's an obvious limit to the number of push constants equal to
226 * the number of register available, and that number is smaller
227 * than the minimum maximum number of vertex program parameters, so
228 * support for pull constants is required if we overflow.
229 * Additionally, on gen6 the number of push constants is even
232 * When there's relative addressing, we don't know what range of
233 * Mesa IR registers can be accessed. And generally, when relative
234 * addressing is used we also have too many constants to load them
235 * all as push constants. So, we'll just support relative
236 * addressing out of the pull constant buffers, and try to load as
237 * many statically-accessed constants into the push constant buffer
240 if (intel
->gen
>= 6) {
241 /* We can only load 32 regs of push constants. */
242 max_constant
= 32 * 2 - c
->key
.nr_userclip_plane_consts
;
244 max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
247 /* constant_map maps from ParameterValues[] index to index in the
248 * push constant buffer, or -1 if it's only in the pull constant
251 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
253 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
255 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
258 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
259 if (inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
260 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
261 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
262 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
263 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) {
267 if (inst
->SrcReg
[arg
].RelAddr
) {
268 c
->vp
->use_const_buffer
= true;
272 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
273 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
278 /* If we ran out of push constant space, then we'll also upload all
279 * constants through the pull constant buffer so that they can be
280 * accessed no matter what. For relative addressing (the common
281 * case) we need them all in place anyway.
283 if (constant
== max_constant
)
284 c
->vp
->use_const_buffer
= true;
286 /* Set up the references to the pull parameters if present. This backend
287 * uses a 1:1 mapping from Mesa IR's index to location in the pull constant
288 * buffer, while the new VS backend allocates values to the pull buffer on
291 if (c
->vp
->use_const_buffer
) {
292 for (i
= 0; i
< params
->NumParameters
* 4; i
++) {
293 c
->prog_data
.pull_param
[i
] = ¶ms
->ParameterValues
[i
/ 4][i
% 4].f
;
295 c
->prog_data
.nr_pull_params
= i
;
298 for (i
= 0; i
< constant
; i
++) {
299 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
303 reg
+= (constant
+ 1) / 2;
304 c
->prog_data
.curb_read_length
= reg
- 1;
305 c
->prog_data
.nr_params
= constant
* 4;
306 /* XXX 0 causes a bug elsewhere... */
307 if (intel
->gen
< 6 && c
->prog_data
.nr_params
== 0)
308 c
->prog_data
.nr_params
= 4;
310 /* Allocate input regs:
313 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
314 if (c
->prog_data
.inputs_read
& BITFIELD64_BIT(i
)) {
316 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
320 /* If there are no inputs, we'll still be reading one attribute's worth
321 * because it's required -- see urb_read_length setting.
323 if (c
->nr_inputs
== 0)
326 /* Allocate outputs. The non-position outputs go straight into message regs.
328 brw_compute_vue_map(&c
->vue_map
, intel
, c
->key
.userclip_active
,
329 c
->prog_data
.outputs_written
);
330 c
->first_output
= reg
;
332 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
334 for (slot
= 0; slot
< c
->vue_map
.num_slots
; slot
++) {
335 int vert_result
= c
->vue_map
.slot_to_vert_result
[slot
];
336 assert(vert_result
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
337 if (can_use_direct_mrf(vert_result
, first_reladdr_output
, slot
)) {
338 c
->regs
[PROGRAM_OUTPUT
][vert_result
] = brw_message_reg(slot
+ 1);
340 c
->regs
[PROGRAM_OUTPUT
][vert_result
] = brw_vec8_grf(reg
, 0);
345 /* Allocate program temporaries:
347 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
348 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
352 /* Address reg(s). Don't try to use the internal address reg until
355 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
356 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
360 BRW_VERTICAL_STRIDE_8
,
362 BRW_HORIZONTAL_STRIDE_1
,
368 if (c
->vp
->use_const_buffer
) {
369 for (i
= 0; i
< 3; i
++) {
370 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
373 clear_current_const(c
);
376 for (i
= 0; i
< 128; i
++) {
377 if (c
->output_regs
[i
].used_in_src
) {
378 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
383 if (c
->needs_stack
) {
384 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
388 /* Some opcodes need an internal temporary:
391 c
->last_tmp
= reg
; /* for allocation purposes */
393 /* Each input reg holds data from two vertices. The
394 * urb_read_length is the number of registers read from *each*
395 * vertex urb, so is half the amount:
397 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
398 /* Setting this field to 0 leads to undefined behavior according to the
399 * VS_STATE docs. Our VUEs will always have at least one attribute
400 * sitting in them, even if it's padding.
402 if (c
->prog_data
.urb_read_length
== 0)
403 c
->prog_data
.urb_read_length
= 1;
405 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
406 * them to fit the biggest thing they need to.
408 attributes_in_vue
= MAX2(c
->vue_map
.num_slots
, c
->nr_inputs
);
410 if (intel
->gen
== 6) {
411 /* Each attribute is 32 bytes (2 vec4s), so dividing by 8 gives us the
412 * number of 128-byte (1024-bit) units.
414 c
->prog_data
.urb_entry_size
= ALIGN(attributes_in_vue
, 8) / 8;
416 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
417 * number of 64-byte (512-bit) units.
419 c
->prog_data
.urb_entry_size
= ALIGN(attributes_in_vue
, 4) / 4;
422 c
->prog_data
.total_grf
= reg
;
424 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
425 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
426 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
427 printf("%s reg = %d\n", __FUNCTION__
, reg
);
433 * If an instruction uses a temp reg both as a src and the dest, we
434 * sometimes need to allocate an intermediate temporary.
436 static void unalias1( struct brw_vs_compile
*c
,
439 void (*func
)( struct brw_vs_compile
*,
443 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
444 struct brw_compile
*p
= &c
->func
;
445 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
447 brw_MOV(p
, dst
, tmp
);
457 * Checks if a 2-operand instruction needs an intermediate temporary.
459 static void unalias2( struct brw_vs_compile
*c
,
463 void (*func
)( struct brw_vs_compile
*,
468 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
469 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
470 struct brw_compile
*p
= &c
->func
;
471 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
472 func(c
, tmp
, arg0
, arg1
);
473 brw_MOV(p
, dst
, tmp
);
477 func(c
, dst
, arg0
, arg1
);
483 * Checks if a 3-operand instruction needs an intermediate temporary.
485 static void unalias3( struct brw_vs_compile
*c
,
490 void (*func
)( struct brw_vs_compile
*,
496 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
497 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
498 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
499 struct brw_compile
*p
= &c
->func
;
500 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
501 func(c
, tmp
, arg0
, arg1
, arg2
);
502 brw_MOV(p
, dst
, tmp
);
506 func(c
, dst
, arg0
, arg1
, arg2
);
510 static void emit_sop( struct brw_vs_compile
*c
,
516 struct brw_compile
*p
= &c
->func
;
518 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
519 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
520 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
521 brw_set_predicate_control_flag_value(p
, 0xff);
524 static void emit_seq( struct brw_vs_compile
*c
,
527 struct brw_reg arg1
)
529 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
532 static void emit_sne( struct brw_vs_compile
*c
,
535 struct brw_reg arg1
)
537 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
539 static void emit_slt( struct brw_vs_compile
*c
,
542 struct brw_reg arg1
)
544 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
547 static void emit_sle( struct brw_vs_compile
*c
,
550 struct brw_reg arg1
)
552 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
555 static void emit_sgt( struct brw_vs_compile
*c
,
558 struct brw_reg arg1
)
560 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
563 static void emit_sge( struct brw_vs_compile
*c
,
566 struct brw_reg arg1
)
568 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
571 static void emit_cmp( struct brw_compile
*p
,
575 struct brw_reg arg2
)
577 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
578 brw_SEL(p
, dst
, arg1
, arg2
);
579 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
582 static void emit_sign(struct brw_vs_compile
*c
,
586 struct brw_compile
*p
= &c
->func
;
588 brw_MOV(p
, dst
, brw_imm_f(0));
590 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
591 brw_MOV(p
, dst
, brw_imm_f(-1.0));
592 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
594 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
595 brw_MOV(p
, dst
, brw_imm_f(1.0));
596 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
599 static void emit_max( struct brw_compile
*p
,
602 struct brw_reg arg1
)
604 struct intel_context
*intel
= &p
->brw
->intel
;
606 if (intel
->gen
>= 6) {
607 brw_set_conditionalmod(p
, BRW_CONDITIONAL_GE
);
608 brw_SEL(p
, dst
, arg0
, arg1
);
609 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
610 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
612 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
613 brw_SEL(p
, dst
, arg0
, arg1
);
614 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
618 static void emit_min( struct brw_compile
*p
,
621 struct brw_reg arg1
)
623 struct intel_context
*intel
= &p
->brw
->intel
;
625 if (intel
->gen
>= 6) {
626 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
627 brw_SEL(p
, dst
, arg0
, arg1
);
628 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
629 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
631 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
632 brw_SEL(p
, dst
, arg0
, arg1
);
633 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
637 static void emit_arl(struct brw_compile
*p
,
641 struct intel_context
*intel
= &p
->brw
->intel
;
643 if (intel
->gen
>= 6) {
644 struct brw_reg dst_f
= retype(dst
, BRW_REGISTER_TYPE_F
);
646 brw_RNDD(p
, dst_f
, src
);
647 brw_MOV(p
, dst
, dst_f
);
649 brw_RNDD(p
, dst
, src
);
653 static void emit_math1_gen4(struct brw_vs_compile
*c
,
659 /* There are various odd behaviours with SEND on the simulator. In
660 * addition there are documented issues with the fact that the GEN4
661 * processor doesn't do dependency control properly on SEND
662 * results. So, on balance, this kludge to get around failures
663 * with writemasked math results looks like it might be necessary
664 * whether that turns out to be a simulator bug or not:
666 struct brw_compile
*p
= &c
->func
;
667 struct brw_reg tmp
= dst
;
668 bool need_tmp
= false;
670 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
671 dst
.dw1
.bits
.writemask
!= 0xf)
680 BRW_MATH_SATURATE_NONE
,
683 BRW_MATH_DATA_SCALAR
,
687 brw_MOV(p
, dst
, tmp
);
693 emit_math1_gen6(struct brw_vs_compile
*c
,
699 struct brw_compile
*p
= &c
->func
;
700 struct brw_reg tmp_src
, tmp_dst
;
702 /* Something is strange on gen6 math in 16-wide mode, though the
703 * docs say it's supposed to work. Punt to using align1 mode,
704 * which doesn't do writemasking and swizzles.
706 tmp_src
= get_tmp(c
);
707 tmp_dst
= get_tmp(c
);
709 brw_MOV(p
, tmp_src
, arg0
);
711 brw_set_access_mode(p
, BRW_ALIGN_1
);
715 BRW_MATH_SATURATE_NONE
,
718 BRW_MATH_DATA_SCALAR
,
720 brw_set_access_mode(p
, BRW_ALIGN_16
);
722 brw_MOV(p
, dst
, tmp_dst
);
724 release_tmp(c
, tmp_src
);
725 release_tmp(c
, tmp_dst
);
729 emit_math1(struct brw_vs_compile
*c
,
735 struct brw_compile
*p
= &c
->func
;
736 struct intel_context
*intel
= &p
->brw
->intel
;
739 emit_math1_gen6(c
, function
, dst
, arg0
, precision
);
741 emit_math1_gen4(c
, function
, dst
, arg0
, precision
);
744 static void emit_math2_gen4( struct brw_vs_compile
*c
,
751 struct brw_compile
*p
= &c
->func
;
752 struct brw_reg tmp
= dst
;
753 bool need_tmp
= false;
755 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
756 dst
.dw1
.bits
.writemask
!= 0xf)
762 brw_MOV(p
, brw_message_reg(3), arg1
);
767 BRW_MATH_SATURATE_NONE
,
770 BRW_MATH_DATA_SCALAR
,
774 brw_MOV(p
, dst
, tmp
);
779 static void emit_math2_gen6( struct brw_vs_compile
*c
,
786 struct brw_compile
*p
= &c
->func
;
787 struct brw_reg tmp_src0
, tmp_src1
, tmp_dst
;
789 tmp_src0
= get_tmp(c
);
790 tmp_src1
= get_tmp(c
);
791 tmp_dst
= get_tmp(c
);
793 brw_MOV(p
, tmp_src0
, arg0
);
794 brw_MOV(p
, tmp_src1
, arg1
);
796 brw_set_access_mode(p
, BRW_ALIGN_1
);
802 brw_set_access_mode(p
, BRW_ALIGN_16
);
804 brw_MOV(p
, dst
, tmp_dst
);
806 release_tmp(c
, tmp_src0
);
807 release_tmp(c
, tmp_src1
);
808 release_tmp(c
, tmp_dst
);
811 static void emit_math2( struct brw_vs_compile
*c
,
818 struct brw_compile
*p
= &c
->func
;
819 struct intel_context
*intel
= &p
->brw
->intel
;
822 emit_math2_gen6(c
, function
, dst
, arg0
, arg1
, precision
);
824 emit_math2_gen4(c
, function
, dst
, arg0
, arg1
, precision
);
827 static void emit_exp_noalias( struct brw_vs_compile
*c
,
829 struct brw_reg arg0
)
831 struct brw_compile
*p
= &c
->func
;
834 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
835 struct brw_reg tmp
= get_tmp(c
);
836 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
838 /* tmp_d = floor(arg0.x) */
839 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
841 /* result[0] = 2.0 ^ tmp */
843 /* Adjust exponent for floating point:
846 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
848 /* Install exponent and sign.
849 * Excess drops off the edge:
851 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
852 tmp_d
, brw_imm_d(23));
857 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
858 /* result[1] = arg0.x - floor(arg0.x) */
859 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
862 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
863 /* As with the LOG instruction, we might be better off just
864 * doing a taylor expansion here, seeing as we have to do all
867 * If mathbox partial precision is too low, consider also:
868 * result[3] = result[0] * EXP(result[1])
871 BRW_MATH_FUNCTION_EXP
,
872 brw_writemask(dst
, WRITEMASK_Z
),
873 brw_swizzle1(arg0
, 0),
874 BRW_MATH_PRECISION_FULL
);
877 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
878 /* result[3] = 1.0; */
879 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
884 static void emit_log_noalias( struct brw_vs_compile
*c
,
886 struct brw_reg arg0
)
888 struct brw_compile
*p
= &c
->func
;
889 struct brw_reg tmp
= dst
;
890 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
891 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
892 bool need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
893 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
897 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
900 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
903 * These almost look like they could be joined up, but not really
906 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
907 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
909 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
911 brw_writemask(tmp_ud
, WRITEMASK_X
),
912 brw_swizzle1(arg0_ud
, 0),
913 brw_imm_ud((1U<<31)-1));
916 brw_writemask(tmp_ud
, WRITEMASK_X
),
921 brw_writemask(tmp
, WRITEMASK_X
),
922 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
926 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
928 brw_writemask(tmp_ud
, WRITEMASK_Y
),
929 brw_swizzle1(arg0_ud
, 0),
930 brw_imm_ud((1<<23)-1));
933 brw_writemask(tmp_ud
, WRITEMASK_Y
),
935 brw_imm_ud(127<<23));
938 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
939 /* result[2] = result[0] + LOG2(result[1]); */
941 /* Why bother? The above is just a hint how to do this with a
942 * taylor series. Maybe we *should* use a taylor series as by
943 * the time all the above has been done it's almost certainly
944 * quicker than calling the mathbox, even with low precision.
947 * - result[0] + mathbox.LOG2(result[1])
948 * - mathbox.LOG2(arg0.x)
949 * - result[0] + inline_taylor_approx(result[1])
952 BRW_MATH_FUNCTION_LOG
,
953 brw_writemask(tmp
, WRITEMASK_Z
),
954 brw_swizzle1(tmp
, 1),
955 BRW_MATH_PRECISION_FULL
);
958 brw_writemask(tmp
, WRITEMASK_Z
),
959 brw_swizzle1(tmp
, 2),
960 brw_swizzle1(tmp
, 0));
963 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
964 /* result[3] = 1.0; */
965 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
969 brw_MOV(p
, dst
, tmp
);
975 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
977 static void emit_dst_noalias( struct brw_vs_compile
*c
,
982 struct brw_compile
*p
= &c
->func
;
984 /* There must be a better way to do this:
986 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
987 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
988 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
989 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
990 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
991 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
992 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
993 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
997 static void emit_xpd( struct brw_compile
*p
,
1002 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
1003 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
1007 static void emit_lit_noalias( struct brw_vs_compile
*c
,
1009 struct brw_reg arg0
)
1011 struct brw_compile
*p
= &c
->func
;
1012 struct brw_reg tmp
= dst
;
1013 bool need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1018 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
1019 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
1021 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1022 * to get all channels active inside the IF. In the clipping code
1023 * we run with NoMask, so it's not an option and we can use
1024 * BRW_EXECUTE_1 for all comparisons.
1026 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
1027 brw_IF(p
, BRW_EXECUTE_8
);
1029 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
1031 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
1032 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
1033 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1036 BRW_MATH_FUNCTION_POW
,
1037 brw_writemask(dst
, WRITEMASK_Z
),
1038 brw_swizzle1(tmp
, 2),
1039 brw_swizzle1(arg0
, 3),
1040 BRW_MATH_PRECISION_PARTIAL
);
1044 release_tmp(c
, tmp
);
1047 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
1049 struct brw_reg arg0
,
1050 struct brw_reg arg1
,
1051 struct brw_reg arg2
)
1053 struct brw_compile
*p
= &c
->func
;
1055 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
1056 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
1057 brw_MAC(p
, dst
, arg0
, arg1
);
1060 static struct brw_reg
1061 get_constant(struct brw_vs_compile
*c
,
1062 const struct prog_instruction
*inst
,
1065 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1066 struct brw_compile
*p
= &c
->func
;
1067 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1069 assert(argIndex
< 3);
1071 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
1072 /* Keep track of the last constant loaded in this slot, for reuse. */
1073 c
->current_const
[argIndex
].index
= src
->Index
;
1076 printf(" fetch const[%d] for arg %d into reg %d\n",
1077 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1079 /* need to fetch the constant now */
1081 const_reg
, /* writeback dest */
1082 16 * src
->Index
, /* byte offset */
1083 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
1087 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1088 const_reg
= stride(const_reg
, 0, 4, 1);
1089 const_reg
.subnr
= 0;
1094 static struct brw_reg
1095 get_reladdr_constant(struct brw_vs_compile
*c
,
1096 const struct prog_instruction
*inst
,
1099 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1100 struct brw_compile
*p
= &c
->func
;
1101 struct brw_context
*brw
= p
->brw
;
1102 struct intel_context
*intel
= &brw
->intel
;
1103 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1104 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1107 assert(argIndex
< 3);
1109 /* Can't reuse a reladdr constant load. */
1110 c
->current_const
[argIndex
].index
= -1;
1113 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1114 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1117 if (intel
->gen
>= 6) {
1118 offset
= src
->Index
;
1120 struct brw_reg byte_addr_reg
= retype(get_tmp(c
), BRW_REGISTER_TYPE_D
);
1121 brw_MUL(p
, byte_addr_reg
, addr_reg
, brw_imm_d(16));
1122 addr_reg
= byte_addr_reg
;
1123 offset
= 16 * src
->Index
;
1126 /* fetch the first vec4 */
1127 brw_dp_READ_4_vs_relative(p
,
1131 SURF_INDEX_VERT_CONST_BUFFER
);
1138 /* TODO: relative addressing!
1140 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
1141 gl_register_file file
,
1145 case PROGRAM_TEMPORARY
:
1147 case PROGRAM_OUTPUT
:
1148 assert(c
->regs
[file
][index
].nr
!= 0);
1149 return c
->regs
[file
][index
];
1150 case PROGRAM_STATE_VAR
:
1151 case PROGRAM_CONSTANT
:
1152 case PROGRAM_UNIFORM
:
1153 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1154 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1155 case PROGRAM_ADDRESS
:
1157 return c
->regs
[file
][index
];
1159 case PROGRAM_UNDEFINED
: /* undef values */
1160 return brw_null_reg();
1162 case PROGRAM_LOCAL_PARAM
:
1163 case PROGRAM_ENV_PARAM
:
1164 case PROGRAM_WRITE_ONLY
:
1167 return brw_null_reg();
1173 * Indirect addressing: get reg[[arg] + offset].
1175 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1180 struct brw_compile
*p
= &c
->func
;
1181 struct brw_reg tmp
= get_tmp(c
);
1182 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1183 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1184 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1185 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1186 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1188 /* Set the vertical stride on the register access so that the first
1189 * 4 components come from a0.0 and the second 4 from a0.1.
1191 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1194 brw_push_insn_state(p
);
1195 brw_set_access_mode(p
, BRW_ALIGN_1
);
1197 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1198 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1200 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1201 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1203 brw_MOV(p
, tmp
, indirect
);
1205 brw_pop_insn_state(p
);
1208 /* NOTE: tmp not released */
1213 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1214 const struct prog_instruction
*inst
,
1217 struct brw_compile
*p
= &c
->func
;
1219 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1220 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1221 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1222 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1223 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1224 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1226 /* Because destination register indirect addressing can only use
1227 * one index, we'll write each vertex's vec4 value separately.
1229 val
.width
= BRW_WIDTH_4
;
1230 val
.vstride
= BRW_VERTICAL_STRIDE_4
;
1232 brw_push_insn_state(p
);
1233 brw_set_access_mode(p
, BRW_ALIGN_1
);
1235 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1236 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1237 brw_MOV(p
, indirect
, val
);
1239 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1240 brw_ADD(p
, brw_address_reg(0), acc
,
1241 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1242 brw_MOV(p
, indirect
, suboffset(val
, 4));
1244 brw_pop_insn_state(p
);
/* get_src_reg(c, inst, argIndex): map source operand SrcReg[argIndex] of
 * inst to a hardware register.
 *
 * NOTE(review): this listing omits a number of original lines (the embedded
 * line numbers skip), so only the logic that is actually visible is
 * described here.
 *
 * Immediate fast path, taken only when brw_vs_arg_can_be_immediate() accepts
 * the opcode/argument pair:
 *   - a swizzle built from SWIZZLE_ZERO returns brw_imm_f(0.0f);
 *   - a swizzle built from SWIZZLE_ONE returns brw_imm_f(-1.0F) or
 *     brw_imm_f(1.0F) (the selecting condition is not visible; presumably
 *     the source negate flag, to be confirmed);
 *   - a PROGRAM_CONSTANT source for which a single component could be
 *     identified (component >= 0) is read from
 *     c->vp->program.Base.Parameters->ParameterValues and returned as a
 *     float immediate.
 *
 * Otherwise the register file decides:
 *   - PROGRAM_TEMPORARY / PROGRAM_OUTPUT: either deref() from element 0 of
 *     c->regs[file] using index, or the statically allocated
 *     c->regs[file][index], which is asserted to have a non-zero nr
 *     (presumably selected by relAddr, to be confirmed);
 *   - PROGRAM_STATE_VAR / CONSTANT / UNIFORM / ENV_PARAM / LOCAL_PARAM:
 *     the push-constant register when the access is not relative and
 *     c->constant_map[index] != -1; otherwise the pull path, which asserts
 *     c->vp->use_const_buffer and goes through get_reladdr_constant() or
 *     get_constant();
 *   - PROGRAM_ADDRESS: c->regs[file][index];
 *   - PROGRAM_UNDEFINED: brw_null_reg(), expected because callers walk all
 *     three source slots;
 *   - PROGRAM_WRITE_ONLY: brw_null_reg().
 */
1248 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1249 * TODO: relative addressing!
1251 static struct brw_reg
1252 get_src_reg( struct brw_vs_compile
*c
,
1253 const struct prog_instruction
*inst
,
1256 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1257 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1258 const bool relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1260 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1261 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1263 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1267 return brw_imm_f(0.0f
);
1268 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1273 return brw_imm_f(-1.0F
);
1275 return brw_imm_f(1.0F
);
1276 } else if (src
->File
== PROGRAM_CONSTANT
) {
1277 const struct gl_program_parameter_list
*params
;
1281 switch (src
->Swizzle
) {
1296 if (component
>= 0) {
1297 params
= c
->vp
->program
.Base
.Parameters
;
1298 f
= params
->ParameterValues
[src
->Index
][component
].f
;
1304 return brw_imm_f(f
);
1310 case PROGRAM_TEMPORARY
:
1312 case PROGRAM_OUTPUT
:
1314 return deref(c
, c
->regs
[file
][0], index
, 32);
1317 assert(c
->regs
[file
][index
].nr
!= 0);
1318 return c
->regs
[file
][index
];
1321 case PROGRAM_STATE_VAR
:
1322 case PROGRAM_CONSTANT
:
1323 case PROGRAM_UNIFORM
:
1324 case PROGRAM_ENV_PARAM
:
1325 case PROGRAM_LOCAL_PARAM
:
1326 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1327 /* Take from the push constant buffer if possible. */
1328 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1329 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1331 /* Must be in the pull constant buffer then .*/
1332 assert(c
->vp
->use_const_buffer
);
1334 return get_reladdr_constant(c
, inst
, argIndex
);
1336 return get_constant(c
, inst
, argIndex
);
1338 case PROGRAM_ADDRESS
:
1340 return c
->regs
[file
][index
];
1342 case PROGRAM_UNDEFINED
:
1343 /* this is a normal case since we loop over all three src args */
1344 return brw_null_reg();
1346 case PROGRAM_WRITE_ONLY
:
1349 return brw_null_reg();
1354 * Return the brw reg for the given instruction's src argument.
1355 * Will return mangled results for SWZ op. The emit_swz() function
1356 * ignores this result and recalculates taking extended swizzles into
1359 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1360 const struct prog_instruction
*inst
,
1363 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1366 if (src
->File
== PROGRAM_UNDEFINED
)
1367 return brw_null_reg();
1369 reg
= get_src_reg(c
, inst
, argIndex
);
1371 /* Convert 3-bit swizzle to 2-bit.
1373 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1374 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1375 GET_SWZ(src
->Swizzle
, 1),
1376 GET_SWZ(src
->Swizzle
, 2),
1377 GET_SWZ(src
->Swizzle
, 3));
1379 /* Note this is ok for non-swizzle ARB_vp instructions */
1380 reg
.negate
= src
->Negate
? 1 : 0;
1388 * Get brw register for the given program dest register.
1390 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1391 struct prog_dst_register dst
)
1396 case PROGRAM_TEMPORARY
:
1397 case PROGRAM_OUTPUT
:
1398 /* register-indirect addressing is only 1x1, not VxH, for
1399 * destination regs. So, for RelAddr we'll return a temporary
1400 * for the dest and do a move of the result to the RelAddr
1401 * register after the instruction emit.
1406 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1407 reg
= c
->regs
[dst
.File
][dst
.Index
];
1410 case PROGRAM_ADDRESS
:
1411 assert(dst
.Index
== 0);
1412 reg
= c
->regs
[dst
.File
][dst
.Index
];
1414 case PROGRAM_UNDEFINED
:
1415 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1416 reg
= brw_null_reg();
1420 reg
= brw_null_reg();
1423 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1424 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1430 static void emit_swz( struct brw_vs_compile
*c
,
1432 const struct prog_instruction
*inst
)
1434 const GLuint argIndex
= 0;
1435 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1436 struct brw_compile
*p
= &c
->func
;
1437 GLuint zeros_mask
= 0;
1438 GLuint ones_mask
= 0;
1439 GLuint src_mask
= 0;
1441 bool need_tmp
= (src
.Negate
&&
1442 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1443 struct brw_reg tmp
= dst
;
1449 for (i
= 0; i
< 4; i
++) {
1450 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1451 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1470 /* Do src first, in case dst aliases src:
1473 struct brw_reg arg0
;
1475 arg0
= get_src_reg(c
, inst
, argIndex
);
1477 arg0
= brw_swizzle(arg0
,
1478 src_swz
[0], src_swz
[1],
1479 src_swz
[2], src_swz
[3]);
1481 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1485 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1488 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1491 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1494 brw_MOV(p
, dst
, tmp
);
1495 release_tmp(c
, tmp
);
1500 align_interleaved_urb_mlen(struct brw_context
*brw
, int mlen
)
1502 struct intel_context
*intel
= &brw
->intel
;
1504 if (intel
->gen
>= 6) {
1505 /* URB data written (does not include the message header reg) must
1506 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1507 * section 5.4.3.2.2: URB_INTERLEAVED.
1509 * URB entries are allocated on a multiple of 1024 bits, so an
1510 * extra 128 bits written here to make the end align to 256 is
1513 if ((mlen
% 2) != 1)
/* emit_vertex_write(c): post-vertex-program processing that sends the
 * shader results to the URB.
 *
 * NOTE(review): this listing omits a number of original lines (the embedded
 * line numbers skip); only the visible logic is summarised.
 *
 * Visible steps, in order:
 *  1. With c->key.copy_edgeflag set, the VERT_ATTRIB_EDGEFLAG input and the
 *     VERT_RESULT_EDGE output registers are fetched via get_reg()
 *     (presumably to copy one into the other, to be confirmed).
 *  2. Before gen6, ndc is computed from the HPOS output: an INV math op on
 *     pos.w, then ndc.xyz = pos * ndc.
 *  3. Vertex header register m1:
 *     - gen6+: m1 is zeroed; if PSIZ is written, its x component goes to
 *       m1.w; if user clipping is active, one DP4 of pos with
 *       c->userplane[i] per plane is written into m3 or m4, channel (i & 3).
 *     - before gen6, when PSIZ is written, user clipping is active or
 *       brw->has_negative_rhw_bug is set: a UD temporary "header1" is
 *       zeroed; point size is scaled by 1<<11 and masked with 0x7ff<<8 into
 *       its W channel; for each user plane a DP4 into the null register is
 *       emitted with conditional modifier L, followed by an OR of (1<<i)
 *       into W; the negative-rhw workaround ORs in bit 6 and moves 0 into
 *       ndc; header1 is then moved to m1 in Align1 mode and released.
 *     - otherwise m1 is simply zeroed.
 *  4. Access mode is switched to Align1 and accumulator write control is
 *     turned off.
 *  5. The fixed part of the VUE is filled per generation and
 *     len_vertex_header is set: gen6+ m2 = pos, length 1 (plus 2 with user
 *     clipping); gen5 m2 = ndc, m3 = pos, m7 = pos, length 6; earlier
 *     m2 = ndc, m3 = pos, length 2.
 *  6. Slots from len_vertex_header up to c->vue_map.num_slots, bounded by
 *     MAX_SLOTS_IN_FIRST_URB_WRITE, whose output lives in a general register
 *     are moved to message registers.
 *  7. The message length is padded with align_interleaved_urb_mlen(),
 *     asserted to be at most BRW_MAX_MRF - 1, and the first URB write is
 *     issued with "writes complete" = eot, where eot is true when every
 *     slot has been handled.
 *  8. If slots remain, they are moved from GRF to MRF and a second URB write
 *     is issued at destination offset MAX_SLOTS_IN_FIRST_URB_WRITE / 2 with
 *     "writes complete" = 1.
 */
1521 * Post-vertex-program processing. Send the results to the URB.
1523 static void emit_vertex_write( struct brw_vs_compile
*c
)
1525 struct brw_compile
*p
= &c
->func
;
1526 struct brw_context
*brw
= p
->brw
;
1527 struct intel_context
*intel
= &brw
->intel
;
1528 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1531 GLuint len_vertex_header
= 2;
1536 if (c
->key
.copy_edgeflag
) {
1538 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1539 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1542 if (intel
->gen
< 6) {
1543 /* Build ndc coords */
1545 /* ndc = 1.0 / pos.w */
1546 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1547 /* ndc.xyz = pos * ndc */
1548 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1551 /* Update the header for point size, user clipping flags, and -ve rhw
1554 if (intel
->gen
>= 6) {
1555 struct brw_reg m1
= brw_message_reg(1);
1557 /* On gen6, m1 has each value in a separate dword, so we never
1558 * need to mess with a temporary for computing the m1 value.
1560 brw_MOV(p
, retype(m1
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1561 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1562 brw_MOV(p
, brw_writemask(m1
, WRITEMASK_W
),
1563 brw_swizzle1(c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
], 0));
1566 /* Set the user clip distances in dword 8-15. (m3-4)*/
1567 if (c
->key
.userclip_active
) {
1568 for (i
= 0; i
< c
->key
.nr_userclip_plane_consts
; i
++) {
1571 m
= brw_message_reg(3);
1573 m
= brw_message_reg(4);
1575 brw_DP4(p
, brw_writemask(m
, (1 << (i
& 3))),pos
, c
->userplane
[i
]);
1578 } else if ((c
->prog_data
.outputs_written
&
1579 BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1580 c
->key
.userclip_active
|| brw
->has_negative_rhw_bug
) {
1581 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1584 brw_MOV(p
, header1
, brw_imm_ud(0));
1586 brw_set_access_mode(p
, BRW_ALIGN_16
);
1588 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1589 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1590 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
),
1591 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1592 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
),
1593 header1
, brw_imm_ud(0x7ff<<8));
1596 for (i
= 0; i
< c
->key
.nr_userclip_plane_consts
; i
++) {
1597 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1598 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1599 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1600 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1603 /* i965 clipping workaround:
1604 * 1) Test for -ve rhw
1606 * set ndc = (0,0,0,0)
1609 * Later, clipping will detect ucp[6] and ensure the primitive is
1610 * clipped against all fixed planes.
1612 if (brw
->has_negative_rhw_bug
) {
1614 vec8(brw_null_reg()),
1616 brw_swizzle1(ndc
, 3),
1619 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1620 brw_MOV(p
, ndc
, brw_imm_f(0));
1621 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1624 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1625 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1626 brw_set_access_mode(p
, BRW_ALIGN_16
);
1628 release_tmp(c
, header1
);
1631 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1634 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1635 * of zeros followed by two sets of NDC coordinates:
1637 brw_set_access_mode(p
, BRW_ALIGN_1
);
1638 brw_set_acc_write_control(p
, 0);
1640 /* The VUE layout is documented in Volume 2a. */
1641 if (intel
->gen
>= 6) {
1642 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1643 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1644 * dword 4-7 (m2) is the 4D space position
1645 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1647 * m3 or 5 is the first vertex element data we fill, which is
1648 * the vertex position.
1650 brw_MOV(p
, brw_message_reg(2), pos
);
1651 len_vertex_header
= 1;
1652 if (c
->key
.userclip_active
)
1653 len_vertex_header
+= 2;
1654 } else if (intel
->gen
== 5) {
1655 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1656 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1657 * dword 4-7 (m2) is the ndc position (set above)
1658 * dword 8-11 (m3) of the vertex header is the 4D space position
1659 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1660 * m6 is a pad so that the vertex element data is aligned
1661 * m7 is the first vertex data we fill, which is the vertex position.
1663 brw_MOV(p
, brw_message_reg(2), ndc
);
1664 brw_MOV(p
, brw_message_reg(3), pos
);
1665 brw_MOV(p
, brw_message_reg(7), pos
);
1666 len_vertex_header
= 6;
1668 /* There are 8 dwords in VUE header pre-Ironlake:
1669 * dword 0-3 (m1) is indices, point width, clip flags.
1670 * dword 4-7 (m2) is ndc position (set above)
1672 * dword 8-11 (m3) is the first vertex data, which we always have be the
1675 brw_MOV(p
, brw_message_reg(2), ndc
);
1676 brw_MOV(p
, brw_message_reg(3), pos
);
1677 len_vertex_header
= 2;
1680 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1681 for (slot
= len_vertex_header
; slot
< c
->vue_map
.num_slots
; ++slot
) {
1682 if (slot
>= MAX_SLOTS_IN_FIRST_URB_WRITE
)
1686 int vert_result
= c
->vue_map
.slot_to_vert_result
[slot
];
1687 if (c
->regs
[PROGRAM_OUTPUT
][vert_result
].file
==
1688 BRW_GENERAL_REGISTER_FILE
) {
1689 brw_MOV(p
, brw_message_reg(mrf
),
1690 c
->regs
[PROGRAM_OUTPUT
][vert_result
]);
1694 eot
= (slot
>= c
->vue_map
.num_slots
);
1696 /* Message header, plus the (first part of the) VUE. */
1698 msg_len
= align_interleaved_urb_mlen(brw
, msg_len
);
1699 /* Any outputs beyond BRW_MAX_MRF should be in the second URB write */
1700 assert (msg_len
<= BRW_MAX_MRF
- 1);
1703 brw_null_reg(), /* dest */
1704 0, /* starting mrf reg nr */
1709 0, /* response len */
1711 eot
, /* writes complete */
1712 0, /* urb destination offset */
1713 BRW_URB_SWIZZLE_INTERLEAVE
);
1715 if (slot
< c
->vue_map
.num_slots
) {
1716 /* Not all of the vertex outputs/results fit into the MRF.
1717 * Move the overflowed attributes from the GRF to the MRF and
1718 * issue another brw_urb_WRITE().
1721 for (; slot
< c
->vue_map
.num_slots
; ++slot
) {
1722 int vert_result
= c
->vue_map
.slot_to_vert_result
[slot
];
1723 /* move from GRF to MRF */
1724 brw_MOV(p
, brw_message_reg(mrf
),
1725 c
->regs
[PROGRAM_OUTPUT
][vert_result
]);
1730 brw_null_reg(), /* dest */
1731 0, /* starting mrf reg nr */
1735 align_interleaved_urb_mlen(brw
, mrf
),
1736 0, /* response len */
1738 1, /* writes complete */
1739 MAX_SLOTS_IN_FIRST_URB_WRITE
/ 2, /* urb destination offset */
1740 BRW_URB_SWIZZLE_INTERLEAVE
);
1745 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1747 struct brw_compile
*p
= &c
->func
;
1748 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1750 if (p
->nr_insn
== 0)
1753 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1756 if (val
.negate
|| val
.abs
)
1759 switch (prev_insn
->header
.opcode
) {
1760 case BRW_OPCODE_MOV
:
1761 case BRW_OPCODE_MAC
:
1762 case BRW_OPCODE_MUL
:
1763 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1764 prev_insn
->header
.execution_size
== val
.width
&&
1765 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1766 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1767 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1768 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1769 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1770 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1780 get_predicate(const struct prog_instruction
*inst
)
1782 if (inst
->DstReg
.CondMask
== COND_TR
)
1783 return BRW_PREDICATE_NONE
;
1785 /* All of GLSL only produces predicates for COND_NE and one channel per
1786 * vector. Fail badly if someone starts doing something else, as it might
1787 * mean infinite looping or something.
1789 * We'd like to support all the condition codes, but our hardware doesn't
1790 * quite match the Mesa IR, which is modeled after the NV extensions. For
1791 * those, the instruction may update the condition codes or not, then any
1792 * later instruction may use one of those condition codes. For gen4, the
1793 * instruction may update the flags register based on one of the condition
1794 * codes output by the instruction, and then further instructions may
1795 * predicate on that. We can probably support this, but it won't
1796 * necessarily be easy.
1798 assert(inst
->DstReg
.CondMask
== COND_NE
);
1800 switch (inst
->DstReg
.CondSwizzle
) {
1802 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1804 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1806 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1808 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1810 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1811 inst
->DstReg
.CondMask
);
1812 return BRW_PREDICATE_NORMAL
;
1817 brw_vs_rescale_gl_fixed(struct brw_vs_compile
*c
)
1819 struct brw_compile
*p
= &c
->func
;
1822 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
1823 if (!(c
->prog_data
.inputs_read
& BITFIELD64_BIT(i
)))
1826 if (c
->key
.gl_fixed_input_size
[i
] != 0) {
1827 struct brw_reg reg
= c
->regs
[PROGRAM_INPUT
][i
];
1830 brw_writemask(reg
, (1 << c
->key
.gl_fixed_input_size
[i
]) - 1),
1831 reg
, brw_imm_f(1.0 / 65536.0));
1836 /* Emit the vertex program instructions here.
1838 void brw_old_vs_emit(struct brw_vs_compile
*c
)
1840 #define MAX_IF_DEPTH 32
1841 #define MAX_LOOP_DEPTH 32
1842 struct brw_compile
*p
= &c
->func
;
1843 struct brw_context
*brw
= p
->brw
;
1844 struct intel_context
*intel
= &brw
->intel
;
1845 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1846 GLuint insn
, loop_depth
= 0;
1847 struct brw_instruction
*loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1848 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1849 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1853 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
1854 printf("vs-mesa:\n");
1855 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1860 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1861 brw_set_access_mode(p
, BRW_ALIGN_16
);
1862 if_depth_in_loop
[loop_depth
] = 0;
1864 brw_set_acc_write_control(p
, 1);
1866 for (insn
= 0; insn
< nr_insns
; insn
++) {
1868 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1870 /* Message registers can't be read, so copy the output into GRF
1871 * register if they are used in source registers
1873 for (i
= 0; i
< 3; i
++) {
1874 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1875 GLuint index
= src
->Index
;
1876 GLuint file
= src
->File
;
1877 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1878 c
->output_regs
[index
].used_in_src
= true;
1881 switch (inst
->Opcode
) {
1884 c
->needs_stack
= true;
1891 /* Static register allocation
1893 brw_vs_alloc_regs(c
);
1895 brw_vs_rescale_gl_fixed(c
);
1898 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1900 for (insn
= 0; insn
< nr_insns
; insn
++) {
1902 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1903 struct brw_reg args
[3], dst
;
1907 printf("%d: ", insn
);
1908 _mesa_print_instruction(inst
);
1911 /* Get argument regs. SWZ is special and does this itself.
1913 if (inst
->Opcode
!= OPCODE_SWZ
)
1914 for (i
= 0; i
< 3; i
++) {
1915 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1918 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
) {
1919 /* Can't just make get_arg "do the right thing" here because
1920 * other callers of get_arg and get_src_reg don't expect any
1921 * special behavior for the c->output_regs[index].used_in_src
1924 args
[i
] = c
->output_regs
[index
].reg
;
1925 args
[i
].dw1
.bits
.swizzle
=
1926 BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1927 GET_SWZ(src
->Swizzle
, 1),
1928 GET_SWZ(src
->Swizzle
, 2),
1929 GET_SWZ(src
->Swizzle
, 3));
1931 /* Note this is ok for non-swizzle ARB_vp instructions */
1932 args
[i
].negate
= src
->Negate
? 1 : 0;
1934 args
[i
] = get_arg(c
, inst
, i
);
1937 /* Get dest regs. Note that it is possible for a reg to be both
1938 * dst and arg, given the static allocation of registers. So
1939 * care needs to be taken emitting multi-operation instructions.
1941 index
= inst
->DstReg
.Index
;
1942 file
= inst
->DstReg
.File
;
1943 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1944 /* Can't just make get_dst "do the right thing" here because other
1945 * callers of get_dst don't expect any special behavior for the
1946 * c->output_regs[index].used_in_src case.
1948 dst
= brw_writemask(c
->output_regs
[index
].reg
, inst
->DstReg
.WriteMask
);
1950 dst
= get_dst(c
, inst
->DstReg
);
1952 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1953 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1954 inst
->SaturateMode
);
1957 switch (inst
->Opcode
) {
1959 args
[0].negate
= false;
1960 brw_MOV(p
, dst
, brw_abs(args
[0]));
1963 brw_ADD(p
, dst
, args
[0], args
[1]);
1966 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1969 brw_DP2(p
, dst
, args
[0], args
[1]);
1972 brw_DP3(p
, dst
, args
[0], args
[1]);
1975 brw_DP4(p
, dst
, args
[0], args
[1]);
1978 brw_DPH(p
, dst
, args
[0], args
[1]);
1981 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1984 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1987 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1990 emit_arl(p
, dst
, args
[0]);
1993 brw_RNDD(p
, dst
, args
[0]);
1996 brw_FRC(p
, dst
, args
[0]);
1999 unalias1(c
, dst
, args
[0], emit_log_noalias
);
2002 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2005 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
2008 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
2011 if (!accumulator_contains(c
, args
[2]))
2012 brw_MOV(p
, brw_acc_reg(), args
[2]);
2013 brw_MAC(p
, dst
, args
[0], args
[1]);
2016 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
2019 emit_max(p
, dst
, args
[0], args
[1]);
2022 emit_min(p
, dst
, args
[0], args
[1]);
2025 brw_MOV(p
, dst
, args
[0]);
2028 brw_MUL(p
, dst
, args
[0], args
[1]);
2031 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
2034 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2037 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, brw_abs(args
[0]), BRW_MATH_PRECISION_FULL
);
2041 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
2044 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2047 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
2050 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
2053 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
2056 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
2059 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
2062 unalias1(c
, dst
, args
[0], emit_sign
);
2065 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
2068 /* The args[0] value can't be used here as it won't have
2069 * correctly encoded the full swizzle:
2071 emit_swz(c
, dst
, inst
);
2074 /* round toward zero */
2075 brw_RNDZ(p
, dst
, args
[0]);
2078 emit_xpd(p
, dst
, args
[0], args
[1]);
2081 struct brw_instruction
*if_inst
= brw_IF(p
, BRW_EXECUTE_8
);
2082 /* Note that brw_IF smashes the predicate_control field. */
2083 if_inst
->header
.predicate_control
= get_predicate(inst
);
2084 if_depth_in_loop
[loop_depth
]++;
2088 clear_current_const(c
);
2092 clear_current_const(c
);
2094 if_depth_in_loop
[loop_depth
]--;
2096 case OPCODE_BGNLOOP
:
2097 clear_current_const(c
);
2098 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
2099 if_depth_in_loop
[loop_depth
] = 0;
2102 brw_set_predicate_control(p
, get_predicate(inst
));
2103 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
2104 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2107 brw_set_predicate_control(p
, get_predicate(inst
));
2108 if (intel
->gen
>= 6) {
2109 gen6_CONT(p
, loop_inst
[loop_depth
- 1]);
2111 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
2113 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2116 case OPCODE_ENDLOOP
: {
2117 clear_current_const(c
);
2118 struct brw_instruction
*inst0
, *inst1
;
2123 if (intel
->gen
== 5)
2126 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2128 if (intel
->gen
< 6) {
2129 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2130 while (inst0
> loop_inst
[loop_depth
]) {
2132 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
2133 inst0
->bits3
.if_else
.jump_count
== 0) {
2134 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2135 } else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
2136 inst0
->bits3
.if_else
.jump_count
== 0) {
2137 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2145 brw_set_predicate_control(p
, get_predicate(inst
));
2146 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2147 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2150 brw_set_access_mode(p
, BRW_ALIGN_1
);
2151 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2152 brw_set_access_mode(p
, BRW_ALIGN_16
);
2153 brw_ADD(p
, get_addr_reg(stack_index
),
2154 get_addr_reg(stack_index
), brw_imm_d(4));
2155 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
2156 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2159 brw_ADD(p
, get_addr_reg(stack_index
),
2160 get_addr_reg(stack_index
), brw_imm_d(-4));
2161 brw_set_access_mode(p
, BRW_ALIGN_1
);
2162 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
2163 brw_set_access_mode(p
, BRW_ALIGN_16
);
2166 emit_vertex_write(c
);
2172 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
2178 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
2179 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
2180 _mesa_opcode_string(inst
->Opcode
) :
2184 /* Set the predication update on the last instruction of the native
2185 * instruction sequence.
2187 * This would be problematic if it was set on a math instruction,
2188 * but that shouldn't be the case with the current GLSL compiler.
2190 if (inst
->CondUpdate
) {
2191 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
2193 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
2194 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
2197 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
2198 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
2199 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
2200 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
2203 /* Result color clamping.
2205 * When destination register is an output register and
2206 * it's primary/secondary front/back color, we have to clamp
2207 * the result to [0,1]. This is done by enabling the
2208 * saturation bit for the last instruction.
2210 * We don't use brw_set_saturate() as it modifies
2211 * p->current->header.saturate, which affects all the subsequent
2212 * instructions. Instead, we directly modify the header
2213 * of the last (already stored) instruction.
2215 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
2216 c
->key
.clamp_vertex_color
) {
2217 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
2218 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
2219 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
2220 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
2221 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
2225 if (inst
->DstReg
.RelAddr
) {
2226 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
2227 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
2228 move_to_reladdr_dst(c
, inst
, dst
);
2234 brw_resolve_cals(p
);
2239 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
2242 printf("vs-native:\n");
2243 for (i
= 0; i
< p
->nr_insn
; i
++)
2244 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);