2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allows two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
124 clear_current_const(struct brw_vs_compile
*c
)
128 if (c
->vp
->use_const_buffer
) {
129 for (i
= 0; i
< 3; i
++) {
130 c
->current_const
[i
].index
= -1;
136 * Preallocate GRF register before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
140 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
142 struct intel_context
*intel
= &c
->func
.brw
->intel
;
143 GLuint i
, reg
= 0, mrf
, j
;
144 int attributes_in_vue
;
145 int first_reladdr_output
;
148 int vert_result_reoder
[VERT_RESULT_MAX
];
151 /* Determine whether to use a real constant buffer or use a block
152 * of GRF registers for constants. The latter is faster but only
153 * works if everything fits in the GRF.
154 * XXX this heuristic/check may need some fine tuning...
156 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
157 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
158 c
->vp
->use_const_buffer
= GL_TRUE
;
160 c
->vp
->use_const_buffer
= GL_FALSE
;
162 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
164 /* r0 -- reserved as usual
166 c
->r0
= brw_vec8_grf(reg
, 0);
169 /* User clip planes from curbe:
171 if (c
->key
.nr_userclip
) {
172 if (intel
->gen
>= 6) {
173 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
174 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
175 (i
% 2) * 4), 0, 4, 1);
177 reg
+= ALIGN(c
->key
.nr_userclip
, 2) / 2;
179 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
180 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ (6 + i
) / 2,
181 (i
% 2) * 4), 0, 4, 1);
183 reg
+= (ALIGN(6 + c
->key
.nr_userclip
, 4) / 4) * 2;
188 /* Assign some (probably all) of the vertex program constants to
189 * the push constant buffer/CURBE.
191 * There's an obvious limit to the number of push constants equal to
192 * the number of registers available, and that number is smaller
193 * than the minimum maximum number of vertex program parameters, so
194 * support for pull constants is required if we overflow.
195 * Additionally, on gen6 the number of push constants is even
198 * When there's relative addressing, we don't know what range of
199 * Mesa IR registers can be accessed. And generally, when relative
200 * addressing is used we also have too many constants to load them
201 * all as push constants. So, we'll just support relative
202 * addressing out of the pull constant buffers, and try to load as
203 * many statically-accessed constants into the push constant buffer
206 if (intel
->gen
>= 6) {
207 /* We can only load 32 regs of push constants. */
208 max_constant
= 32 * 2 - c
->key
.nr_userclip
;
210 max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
213 /* constant_map maps from ParameterValues[] index to index in the
214 * push constant buffer, or -1 if it's only in the pull constant
217 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
219 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
221 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
224 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
225 if (inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
226 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
227 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
228 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
229 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) {
233 if (inst
->SrcReg
[arg
].RelAddr
) {
234 c
->vp
->use_const_buffer
= GL_TRUE
;
238 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
239 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
244 /* If we ran out of push constant space, then we'll also upload all
245 * constants through the pull constant buffer so that they can be
246 * accessed no matter what. For relative addressing (the common
247 * case) we need them all in place anyway.
249 if (constant
== max_constant
)
250 c
->vp
->use_const_buffer
= GL_TRUE
;
252 for (i
= 0; i
< constant
; i
++) {
253 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
257 reg
+= (constant
+ 1) / 2;
258 c
->prog_data
.curb_read_length
= reg
- 1;
259 c
->prog_data
.nr_params
= constant
* 4;
260 /* XXX 0 causes a bug elsewhere... */
261 if (intel
->gen
< 6 && c
->prog_data
.nr_params
== 0)
262 c
->prog_data
.nr_params
= 4;
264 /* Allocate input regs:
267 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
268 if (c
->prog_data
.inputs_read
& (1 << i
)) {
270 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
274 /* If there are no inputs, we'll still be reading one attribute's worth
275 * because it's required -- see urb_read_length setting.
277 if (c
->nr_inputs
== 0)
280 /* Allocate outputs. The non-position outputs go straight into message regs.
283 c
->first_output
= reg
;
284 c
->first_overflow_output
= 0;
286 if (intel
->gen
>= 6) {
288 if (c
->key
.nr_userclip
)
290 } else if (intel
->gen
== 5)
295 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
297 for (i
= 0; i
< VERT_RESULT_MAX
; i
++)
298 vert_result_reoder
[i
] = i
;
300 /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
301 if (intel
->gen
>= 6 && c
->key
.two_side_color
) {
302 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL1
)) &&
303 (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC1
))) {
304 assert(c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL0
));
305 assert(c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC0
));
307 } else if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL0
)) &&
308 (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC0
)))
312 for (i
= 0; i
< bfc
; i
++) {
313 vert_result_reoder
[VERT_RESULT_COL0
+ i
* 2 + 0] = VERT_RESULT_COL0
+ i
;
314 vert_result_reoder
[VERT_RESULT_COL0
+ i
* 2 + 1] = VERT_RESULT_BFC0
+ i
;
317 for (i
= VERT_RESULT_COL0
+ bfc
* 2; i
< VERT_RESULT_BFC0
+ bfc
; i
++) {
318 vert_result_reoder
[i
] = i
- bfc
;
323 for (j
= 0; j
< VERT_RESULT_MAX
; j
++) {
324 i
= vert_result_reoder
[j
];
326 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
328 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
329 if (i
== VERT_RESULT_HPOS
) {
330 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
333 else if (i
== VERT_RESULT_PSIZ
) {
334 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
338 /* Two restrictions on our compute-to-MRF here. The
339 * message length for all SEND messages is restricted to
340 * [1,15], so we can't use mrf 15, as that means a length
343 * Additionally, URB writes are aligned to URB rows, so we
344 * need to put an even number of registers of URB data in
345 * each URB write so that the later write is aligned. A
346 * message length of 15 means 1 message header reg plus 14
349 * For attributes beyond the compute-to-MRF, we compute to
350 * GRFs and they will be written in the second URB_WRITE.
352 if (first_reladdr_output
> i
&& mrf
< 15) {
353 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
357 if (mrf
>= 15 && !c
->first_overflow_output
)
358 c
->first_overflow_output
= i
;
359 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
367 /* Allocate program temporaries:
369 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
370 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
374 /* Address reg(s). Don't try to use the internal address reg until
377 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
378 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
382 BRW_VERTICAL_STRIDE_8
,
384 BRW_HORIZONTAL_STRIDE_1
,
390 if (c
->vp
->use_const_buffer
) {
391 for (i
= 0; i
< 3; i
++) {
392 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
395 clear_current_const(c
);
398 for (i
= 0; i
< 128; i
++) {
399 if (c
->output_regs
[i
].used_in_src
) {
400 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
405 if (c
->needs_stack
) {
406 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
410 /* Some opcodes need an internal temporary:
413 c
->last_tmp
= reg
; /* for allocation purposes */
415 /* Each input reg holds data from two vertices. The
416 * urb_read_length is the number of registers read from *each*
417 * vertex urb, so is half the amount:
419 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
420 /* Setting this field to 0 leads to undefined behavior according to the
421 * VS_STATE docs. Our VUEs will always have at least one attribute
422 * sitting in them, even if it's padding.
424 if (c
->prog_data
.urb_read_length
== 0)
425 c
->prog_data
.urb_read_length
= 1;
427 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
428 * them to fit the biggest thing they need to.
430 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
432 /* See emit_vertex_write() for where the VUE's overhead on top of the
433 * attributes comes from.
435 if (intel
->gen
>= 7) {
437 if (c
->key
.nr_userclip
)
440 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
441 * number of 64-byte (512-bit) units.
443 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 3) / 4;
444 } else if (intel
->gen
== 6) {
446 if (c
->key
.nr_userclip
)
449 /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
450 * number of 128-byte (1024-bit) units.
452 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 7) / 8;
453 } else if (intel
->gen
== 5)
454 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
455 * number of 64-byte (512-bit) units.
457 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
459 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
461 c
->prog_data
.total_grf
= reg
;
463 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
464 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
465 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
466 printf("%s reg = %d\n", __FUNCTION__
, reg
);
472 * If an instruction uses a temp reg both as a src and the dest, we
473 * sometimes need to allocate an intermediate temporary.
475 static void unalias1( struct brw_vs_compile
*c
,
478 void (*func
)( struct brw_vs_compile
*,
482 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
483 struct brw_compile
*p
= &c
->func
;
484 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
486 brw_MOV(p
, dst
, tmp
);
496 * Checks if 2-operand instruction needs an intermediate temporary.
498 static void unalias2( struct brw_vs_compile
*c
,
502 void (*func
)( struct brw_vs_compile
*,
507 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
508 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
509 struct brw_compile
*p
= &c
->func
;
510 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
511 func(c
, tmp
, arg0
, arg1
);
512 brw_MOV(p
, dst
, tmp
);
516 func(c
, dst
, arg0
, arg1
);
522 * Checks if 3-operand instruction needs an intermediate temporary.
524 static void unalias3( struct brw_vs_compile
*c
,
529 void (*func
)( struct brw_vs_compile
*,
535 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
536 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
537 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
538 struct brw_compile
*p
= &c
->func
;
539 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
540 func(c
, tmp
, arg0
, arg1
, arg2
);
541 brw_MOV(p
, dst
, tmp
);
545 func(c
, dst
, arg0
, arg1
, arg2
);
549 static void emit_sop( struct brw_vs_compile
*c
,
555 struct brw_compile
*p
= &c
->func
;
557 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
558 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
559 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
560 brw_set_predicate_control_flag_value(p
, 0xff);
563 static void emit_seq( struct brw_vs_compile
*c
,
566 struct brw_reg arg1
)
568 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
571 static void emit_sne( struct brw_vs_compile
*c
,
574 struct brw_reg arg1
)
576 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
578 static void emit_slt( struct brw_vs_compile
*c
,
581 struct brw_reg arg1
)
583 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
586 static void emit_sle( struct brw_vs_compile
*c
,
589 struct brw_reg arg1
)
591 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
594 static void emit_sgt( struct brw_vs_compile
*c
,
597 struct brw_reg arg1
)
599 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
602 static void emit_sge( struct brw_vs_compile
*c
,
605 struct brw_reg arg1
)
607 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
610 static void emit_cmp( struct brw_compile
*p
,
614 struct brw_reg arg2
)
616 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
617 brw_SEL(p
, dst
, arg1
, arg2
);
618 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
621 static void emit_sign(struct brw_vs_compile
*c
,
625 struct brw_compile
*p
= &c
->func
;
627 brw_MOV(p
, dst
, brw_imm_f(0));
629 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
630 brw_MOV(p
, dst
, brw_imm_f(-1.0));
631 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
633 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
634 brw_MOV(p
, dst
, brw_imm_f(1.0));
635 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
638 static void emit_max( struct brw_compile
*p
,
641 struct brw_reg arg1
)
643 struct intel_context
*intel
= &p
->brw
->intel
;
645 if (intel
->gen
>= 6) {
646 brw_set_conditionalmod(p
, BRW_CONDITIONAL_GE
);
647 brw_SEL(p
, dst
, arg0
, arg1
);
648 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
649 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
651 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
652 brw_SEL(p
, dst
, arg0
, arg1
);
653 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
657 static void emit_min( struct brw_compile
*p
,
660 struct brw_reg arg1
)
662 struct intel_context
*intel
= &p
->brw
->intel
;
664 if (intel
->gen
>= 6) {
665 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
666 brw_SEL(p
, dst
, arg0
, arg1
);
667 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
668 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
670 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
671 brw_SEL(p
, dst
, arg0
, arg1
);
672 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
676 static void emit_arl(struct brw_compile
*p
,
680 struct intel_context
*intel
= &p
->brw
->intel
;
682 if (intel
->gen
>= 6) {
683 struct brw_reg dst_f
= retype(dst
, BRW_REGISTER_TYPE_F
);
685 brw_RNDD(p
, dst_f
, src
);
686 brw_MOV(p
, dst
, dst_f
);
688 brw_RNDD(p
, dst
, src
);
692 static void emit_math1_gen4(struct brw_vs_compile
*c
,
698 /* There are various odd behaviours with SEND on the simulator. In
699 * addition there are documented issues with the fact that the GEN4
700 * processor doesn't do dependency control properly on SEND
701 * results. So, on balance, this kludge to get around failures
702 * with writemasked math results looks like it might be necessary
703 * whether that turns out to be a simulator bug or not:
705 struct brw_compile
*p
= &c
->func
;
706 struct brw_reg tmp
= dst
;
707 GLboolean need_tmp
= GL_FALSE
;
709 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
710 dst
.dw1
.bits
.writemask
!= 0xf)
719 BRW_MATH_SATURATE_NONE
,
722 BRW_MATH_DATA_SCALAR
,
726 brw_MOV(p
, dst
, tmp
);
732 emit_math1_gen6(struct brw_vs_compile
*c
,
738 struct brw_compile
*p
= &c
->func
;
739 struct brw_reg tmp_src
, tmp_dst
;
741 /* Something is strange on gen6 math in 16-wide mode, though the
742 * docs say it's supposed to work. Punt to using align1 mode,
743 * which doesn't do writemasking and swizzles.
745 tmp_src
= get_tmp(c
);
746 tmp_dst
= get_tmp(c
);
748 brw_MOV(p
, tmp_src
, arg0
);
750 brw_set_access_mode(p
, BRW_ALIGN_1
);
754 BRW_MATH_SATURATE_NONE
,
757 BRW_MATH_DATA_SCALAR
,
759 brw_set_access_mode(p
, BRW_ALIGN_16
);
761 brw_MOV(p
, dst
, tmp_dst
);
763 release_tmp(c
, tmp_src
);
764 release_tmp(c
, tmp_dst
);
768 emit_math1(struct brw_vs_compile
*c
,
774 struct brw_compile
*p
= &c
->func
;
775 struct intel_context
*intel
= &p
->brw
->intel
;
778 emit_math1_gen6(c
, function
, dst
, arg0
, precision
);
780 emit_math1_gen4(c
, function
, dst
, arg0
, precision
);
783 static void emit_math2_gen4( struct brw_vs_compile
*c
,
790 struct brw_compile
*p
= &c
->func
;
791 struct brw_reg tmp
= dst
;
792 GLboolean need_tmp
= GL_FALSE
;
794 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
795 dst
.dw1
.bits
.writemask
!= 0xf)
801 brw_MOV(p
, brw_message_reg(3), arg1
);
806 BRW_MATH_SATURATE_NONE
,
809 BRW_MATH_DATA_SCALAR
,
813 brw_MOV(p
, dst
, tmp
);
818 static void emit_math2_gen6( struct brw_vs_compile
*c
,
825 struct brw_compile
*p
= &c
->func
;
826 struct brw_reg tmp_src0
, tmp_src1
, tmp_dst
;
828 tmp_src0
= get_tmp(c
);
829 tmp_src1
= get_tmp(c
);
830 tmp_dst
= get_tmp(c
);
832 brw_MOV(p
, tmp_src0
, arg0
);
833 brw_MOV(p
, tmp_src1
, arg1
);
835 brw_set_access_mode(p
, BRW_ALIGN_1
);
841 brw_set_access_mode(p
, BRW_ALIGN_16
);
843 brw_MOV(p
, dst
, tmp_dst
);
845 release_tmp(c
, tmp_src0
);
846 release_tmp(c
, tmp_src1
);
847 release_tmp(c
, tmp_dst
);
850 static void emit_math2( struct brw_vs_compile
*c
,
857 struct brw_compile
*p
= &c
->func
;
858 struct intel_context
*intel
= &p
->brw
->intel
;
861 emit_math2_gen6(c
, function
, dst
, arg0
, arg1
, precision
);
863 emit_math2_gen4(c
, function
, dst
, arg0
, arg1
, precision
);
866 static void emit_exp_noalias( struct brw_vs_compile
*c
,
868 struct brw_reg arg0
)
870 struct brw_compile
*p
= &c
->func
;
873 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
874 struct brw_reg tmp
= get_tmp(c
);
875 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
877 /* tmp_d = floor(arg0.x) */
878 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
880 /* result[0] = 2.0 ^ tmp */
882 /* Adjust exponent for floating point:
885 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
887 /* Install exponent and sign.
888 * Excess drops off the edge:
890 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
891 tmp_d
, brw_imm_d(23));
896 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
897 /* result[1] = arg0.x - floor(arg0.x) */
898 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
901 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
902 /* As with the LOG instruction, we might be better off just
903 * doing a taylor expansion here, seeing as we have to do all
906 * If mathbox partial precision is too low, consider also:
907 * result[3] = result[0] * EXP(result[1])
910 BRW_MATH_FUNCTION_EXP
,
911 brw_writemask(dst
, WRITEMASK_Z
),
912 brw_swizzle1(arg0
, 0),
913 BRW_MATH_PRECISION_FULL
);
916 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
917 /* result[3] = 1.0; */
918 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
923 static void emit_log_noalias( struct brw_vs_compile
*c
,
925 struct brw_reg arg0
)
927 struct brw_compile
*p
= &c
->func
;
928 struct brw_reg tmp
= dst
;
929 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
930 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
931 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
932 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
936 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
939 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
942 * These almost look like they could be joined up, but not really
945 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
946 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
948 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
950 brw_writemask(tmp_ud
, WRITEMASK_X
),
951 brw_swizzle1(arg0_ud
, 0),
952 brw_imm_ud((1U<<31)-1));
955 brw_writemask(tmp_ud
, WRITEMASK_X
),
960 brw_writemask(tmp
, WRITEMASK_X
),
961 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
965 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
967 brw_writemask(tmp_ud
, WRITEMASK_Y
),
968 brw_swizzle1(arg0_ud
, 0),
969 brw_imm_ud((1<<23)-1));
972 brw_writemask(tmp_ud
, WRITEMASK_Y
),
974 brw_imm_ud(127<<23));
977 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
978 /* result[2] = result[0] + LOG2(result[1]); */
980 /* Why bother? The above is just a hint how to do this with a
981 * taylor series. Maybe we *should* use a taylor series as by
982 * the time all the above has been done it's almost certainly
983 * quicker than calling the mathbox, even with low precision.
986 * - result[0] + mathbox.LOG2(result[1])
987 * - mathbox.LOG2(arg0.x)
988 * - result[0] + inline_taylor_approx(result[1])
991 BRW_MATH_FUNCTION_LOG
,
992 brw_writemask(tmp
, WRITEMASK_Z
),
993 brw_swizzle1(tmp
, 1),
994 BRW_MATH_PRECISION_FULL
);
997 brw_writemask(tmp
, WRITEMASK_Z
),
998 brw_swizzle1(tmp
, 2),
999 brw_swizzle1(tmp
, 0));
1002 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
1003 /* result[3] = 1.0; */
1004 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
1008 brw_MOV(p
, dst
, tmp
);
1009 release_tmp(c
, tmp
);
1014 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
1016 static void emit_dst_noalias( struct brw_vs_compile
*c
,
1018 struct brw_reg arg0
,
1019 struct brw_reg arg1
)
1021 struct brw_compile
*p
= &c
->func
;
1023 /* There must be a better way to do this:
1025 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
1026 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
1027 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
1028 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
1029 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
1030 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
1031 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
1032 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
1036 static void emit_xpd( struct brw_compile
*p
,
1041 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
1042 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
1046 static void emit_lit_noalias( struct brw_vs_compile
*c
,
1048 struct brw_reg arg0
)
1050 struct brw_compile
*p
= &c
->func
;
1051 struct brw_reg tmp
= dst
;
1052 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1057 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
1058 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
1060 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1061 * to get all channels active inside the IF. In the clipping code
1062 * we run with NoMask, so it's not an option and we can use
1063 * BRW_EXECUTE_1 for all comparisons.
1065 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
1066 brw_IF(p
, BRW_EXECUTE_8
);
1068 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
1070 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
1071 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
1072 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1075 BRW_MATH_FUNCTION_POW
,
1076 brw_writemask(dst
, WRITEMASK_Z
),
1077 brw_swizzle1(tmp
, 2),
1078 brw_swizzle1(arg0
, 3),
1079 BRW_MATH_PRECISION_PARTIAL
);
1083 release_tmp(c
, tmp
);
1086 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
1088 struct brw_reg arg0
,
1089 struct brw_reg arg1
,
1090 struct brw_reg arg2
)
1092 struct brw_compile
*p
= &c
->func
;
1094 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
1095 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
1096 brw_MAC(p
, dst
, arg0
, arg1
);
1099 /** 3 or 4-component vector normalization */
1100 static void emit_nrm( struct brw_vs_compile
*c
,
1102 struct brw_reg arg0
,
1105 struct brw_compile
*p
= &c
->func
;
1106 struct brw_reg tmp
= get_tmp(c
);
1108 /* tmp = dot(arg0, arg0) */
1110 brw_DP3(p
, tmp
, arg0
, arg0
);
1112 brw_DP4(p
, tmp
, arg0
, arg0
);
1114 /* tmp = 1 / sqrt(tmp) */
1115 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
1117 /* dst = arg0 * tmp */
1118 brw_MUL(p
, dst
, arg0
, tmp
);
1120 release_tmp(c
, tmp
);
1124 static struct brw_reg
1125 get_constant(struct brw_vs_compile
*c
,
1126 const struct prog_instruction
*inst
,
1129 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1130 struct brw_compile
*p
= &c
->func
;
1131 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1133 assert(argIndex
< 3);
1135 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
1136 /* Keep track of the last constant loaded in this slot, for reuse. */
1137 c
->current_const
[argIndex
].index
= src
->Index
;
1140 printf(" fetch const[%d] for arg %d into reg %d\n",
1141 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1143 /* need to fetch the constant now */
1145 const_reg
, /* writeback dest */
1146 16 * src
->Index
, /* byte offset */
1147 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
1151 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1152 const_reg
= stride(const_reg
, 0, 4, 1);
1153 const_reg
.subnr
= 0;
1158 static struct brw_reg
1159 get_reladdr_constant(struct brw_vs_compile
*c
,
1160 const struct prog_instruction
*inst
,
1163 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1164 struct brw_compile
*p
= &c
->func
;
1165 struct brw_context
*brw
= p
->brw
;
1166 struct intel_context
*intel
= &brw
->intel
;
1167 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1168 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1171 assert(argIndex
< 3);
1173 /* Can't reuse a reladdr constant load. */
1174 c
->current_const
[argIndex
].index
= -1;
1177 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1178 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1181 if (intel
->gen
>= 6) {
1182 offset
= src
->Index
;
1184 struct brw_reg byte_addr_reg
= retype(get_tmp(c
), BRW_REGISTER_TYPE_D
);
1185 brw_MUL(p
, byte_addr_reg
, addr_reg
, brw_imm_d(16));
1186 addr_reg
= byte_addr_reg
;
1187 offset
= 16 * src
->Index
;
1190 /* fetch the first vec4 */
1191 brw_dp_READ_4_vs_relative(p
,
1195 SURF_INDEX_VERT_CONST_BUFFER
);
1202 /* TODO: relative addressing!
1204 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
1205 gl_register_file file
,
1209 case PROGRAM_TEMPORARY
:
1211 case PROGRAM_OUTPUT
:
1212 assert(c
->regs
[file
][index
].nr
!= 0);
1213 return c
->regs
[file
][index
];
1214 case PROGRAM_STATE_VAR
:
1215 case PROGRAM_CONSTANT
:
1216 case PROGRAM_UNIFORM
:
1217 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1218 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1219 case PROGRAM_ADDRESS
:
1221 return c
->regs
[file
][index
];
1223 case PROGRAM_UNDEFINED
: /* undef values */
1224 return brw_null_reg();
1226 case PROGRAM_LOCAL_PARAM
:
1227 case PROGRAM_ENV_PARAM
:
1228 case PROGRAM_WRITE_ONLY
:
1231 return brw_null_reg();
1237 * Indirect addressing: get reg[[arg] + offset].
1239 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1244 struct brw_compile
*p
= &c
->func
;
1245 struct brw_reg tmp
= get_tmp(c
);
1246 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1247 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1248 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1249 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1250 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1252 /* Set the vertical stride on the register access so that the first
1253 * 4 components come from a0.0 and the second 4 from a0.1.
1255 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1258 brw_push_insn_state(p
);
1259 brw_set_access_mode(p
, BRW_ALIGN_1
);
1261 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1262 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1264 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1265 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1267 brw_MOV(p
, tmp
, indirect
);
1269 brw_pop_insn_state(p
);
1272 /* NOTE: tmp not released */
1277 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1278 const struct prog_instruction
*inst
,
1281 struct brw_compile
*p
= &c
->func
;
1283 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1284 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1285 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1286 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1287 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1288 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1290 /* Because destination register indirect addressing can only use
1291 * one index, we'll write each vertex's vec4 value separately.
1293 val
.width
= BRW_WIDTH_4
;
1294 val
.vstride
= BRW_VERTICAL_STRIDE_4
;
1296 brw_push_insn_state(p
);
1297 brw_set_access_mode(p
, BRW_ALIGN_1
);
1299 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1300 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1301 brw_MOV(p
, indirect
, val
);
1303 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1304 brw_ADD(p
, brw_address_reg(0), acc
,
1305 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1306 brw_MOV(p
, indirect
, suboffset(val
, 4));
1308 brw_pop_insn_state(p
);
1312 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1313 * TODO: relative addressing!
1315 static struct brw_reg
1316 get_src_reg( struct brw_vs_compile
*c
,
1317 const struct prog_instruction
*inst
,
1320 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1321 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1322 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1324 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1325 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1327 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1331 return brw_imm_f(0.0f
);
1332 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1337 return brw_imm_f(-1.0F
);
1339 return brw_imm_f(1.0F
);
1340 } else if (src
->File
== PROGRAM_CONSTANT
) {
1341 const struct gl_program_parameter_list
*params
;
1345 switch (src
->Swizzle
) {
1360 if (component
>= 0) {
1361 params
= c
->vp
->program
.Base
.Parameters
;
1362 f
= params
->ParameterValues
[src
->Index
][component
].f
;
1368 return brw_imm_f(f
);
1374 case PROGRAM_TEMPORARY
:
1376 case PROGRAM_OUTPUT
:
1378 return deref(c
, c
->regs
[file
][0], index
, 32);
1381 assert(c
->regs
[file
][index
].nr
!= 0);
1382 return c
->regs
[file
][index
];
1385 case PROGRAM_STATE_VAR
:
1386 case PROGRAM_CONSTANT
:
1387 case PROGRAM_UNIFORM
:
1388 case PROGRAM_ENV_PARAM
:
1389 case PROGRAM_LOCAL_PARAM
:
1390 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1391 /* Take from the push constant buffer if possible. */
1392 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1393 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1395 /* Must be in the pull constant buffer then .*/
1396 assert(c
->vp
->use_const_buffer
);
1398 return get_reladdr_constant(c
, inst
, argIndex
);
1400 return get_constant(c
, inst
, argIndex
);
1402 case PROGRAM_ADDRESS
:
1404 return c
->regs
[file
][index
];
1406 case PROGRAM_UNDEFINED
:
1407 /* this is a normal case since we loop over all three src args */
1408 return brw_null_reg();
1410 case PROGRAM_WRITE_ONLY
:
1413 return brw_null_reg();
1418 * Return the brw reg for the given instruction's src argument.
1419 * Will return mangled results for SWZ op. The emit_swz() function
1420 * ignores this result and recalculates taking extended swizzles into
1423 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1424 const struct prog_instruction
*inst
,
1427 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1430 if (src
->File
== PROGRAM_UNDEFINED
)
1431 return brw_null_reg();
1433 reg
= get_src_reg(c
, inst
, argIndex
);
1435 /* Convert 3-bit swizzle to 2-bit.
1437 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1438 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1439 GET_SWZ(src
->Swizzle
, 1),
1440 GET_SWZ(src
->Swizzle
, 2),
1441 GET_SWZ(src
->Swizzle
, 3));
1443 /* Note this is ok for non-swizzle ARB_vp instructions */
1444 reg
.negate
= src
->Negate
? 1 : 0;
1452 * Get brw register for the given program dest register.
1454 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1455 struct prog_dst_register dst
)
1460 case PROGRAM_TEMPORARY
:
1461 case PROGRAM_OUTPUT
:
1462 /* register-indirect addressing is only 1x1, not VxH, for
1463 * destination regs. So, for RelAddr we'll return a temporary
1464 * for the dest and do a move of the result to the RelAddr
1465 * register after the instruction emit.
1470 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1471 reg
= c
->regs
[dst
.File
][dst
.Index
];
1474 case PROGRAM_ADDRESS
:
1475 assert(dst
.Index
== 0);
1476 reg
= c
->regs
[dst
.File
][dst
.Index
];
1478 case PROGRAM_UNDEFINED
:
1479 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1480 reg
= brw_null_reg();
1484 reg
= brw_null_reg();
1487 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1488 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1494 static void emit_swz( struct brw_vs_compile
*c
,
1496 const struct prog_instruction
*inst
)
1498 const GLuint argIndex
= 0;
1499 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1500 struct brw_compile
*p
= &c
->func
;
1501 GLuint zeros_mask
= 0;
1502 GLuint ones_mask
= 0;
1503 GLuint src_mask
= 0;
1505 GLboolean need_tmp
= (src
.Negate
&&
1506 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1507 struct brw_reg tmp
= dst
;
1513 for (i
= 0; i
< 4; i
++) {
1514 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1515 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1534 /* Do src first, in case dst aliases src:
1537 struct brw_reg arg0
;
1539 arg0
= get_src_reg(c
, inst
, argIndex
);
1541 arg0
= brw_swizzle(arg0
,
1542 src_swz
[0], src_swz
[1],
1543 src_swz
[2], src_swz
[3]);
1545 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1549 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1552 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1555 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1558 brw_MOV(p
, dst
, tmp
);
1559 release_tmp(c
, tmp
);
1564 align_interleaved_urb_mlen(struct brw_context
*brw
, int mlen
)
1566 struct intel_context
*intel
= &brw
->intel
;
1568 if (intel
->gen
>= 6) {
1569 /* URB data written (does not include the message header reg) must
1570 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1571 * section 5.4.3.2.2: URB_INTERLEAVED.
1573 * URB entries are allocated on a multiple of 1024 bits, so an
1574 * extra 128 bits written here to make the end align to 256 is
1577 if ((mlen
% 2) != 1)
1585 * Post-vertex-program processing. Send the results to the URB.
1587 static void emit_vertex_write( struct brw_vs_compile
*c
)
1589 struct brw_compile
*p
= &c
->func
;
1590 struct brw_context
*brw
= p
->brw
;
1591 struct intel_context
*intel
= &brw
->intel
;
1592 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1595 GLuint len_vertex_header
= 2;
1599 if (c
->key
.copy_edgeflag
) {
1601 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1602 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1605 if (intel
->gen
< 6) {
1606 /* Build ndc coords */
1608 /* ndc = 1.0 / pos.w */
1609 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1610 /* ndc.xyz = pos * ndc */
1611 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1614 /* Update the header for point size, user clipping flags, and -ve rhw
1617 if (intel
->gen
>= 6) {
1618 struct brw_reg m1
= brw_message_reg(1);
1620 /* On gen6, m1 has each value in a separate dword, so we never
1621 * need to mess with a temporary for computing the m1 value.
1623 brw_MOV(p
, retype(m1
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1624 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1625 brw_MOV(p
, brw_writemask(m1
, WRITEMASK_W
),
1626 brw_swizzle1(c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
], 0));
1629 /* Set the user clip distances in dword 8-15. (m3-4)*/
1630 if (c
->key
.nr_userclip
) {
1631 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1634 m
= brw_message_reg(3);
1636 m
= brw_message_reg(4);
1638 brw_DP4(p
, brw_writemask(m
, (1 << (i
& 3))),pos
, c
->userplane
[i
]);
1641 } else if ((c
->prog_data
.outputs_written
&
1642 BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1643 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
) {
1644 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1647 brw_MOV(p
, header1
, brw_imm_ud(0));
1649 brw_set_access_mode(p
, BRW_ALIGN_16
);
1651 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1652 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1653 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
),
1654 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1655 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
),
1656 header1
, brw_imm_ud(0x7ff<<8));
1659 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1660 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1661 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1662 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1663 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1666 /* i965 clipping workaround:
1667 * 1) Test for -ve rhw
1669 * set ndc = (0,0,0,0)
1672 * Later, clipping will detect ucp[6] and ensure the primitive is
1673 * clipped against all fixed planes.
1675 if (brw
->has_negative_rhw_bug
) {
1677 vec8(brw_null_reg()),
1679 brw_swizzle1(ndc
, 3),
1682 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1683 brw_MOV(p
, ndc
, brw_imm_f(0));
1684 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1687 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1688 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1689 brw_set_access_mode(p
, BRW_ALIGN_16
);
1691 release_tmp(c
, header1
);
1694 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1697 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1698 * of zeros followed by two sets of NDC coordinates:
1700 brw_set_access_mode(p
, BRW_ALIGN_1
);
1701 brw_set_acc_write_control(p
, 0);
1703 /* The VUE layout is documented in Volume 2a. */
1704 if (intel
->gen
>= 6) {
1705 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1706 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1707 * dword 4-7 (m2) is the 4D space position
1708 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1710 * m3 or 5 is the first vertex element data we fill, which is
1711 * the vertex position.
1713 brw_MOV(p
, brw_message_reg(2), pos
);
1714 len_vertex_header
= 1;
1715 if (c
->key
.nr_userclip
> 0)
1716 len_vertex_header
+= 2;
1717 } else if (intel
->gen
== 5) {
1718 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1719 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1720 * dword 4-7 (m2) is the ndc position (set above)
1721 * dword 8-11 (m3) of the vertex header is the 4D space position
1722 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1723 * m6 is a pad so that the vertex element data is aligned
1724 * m7 is the first vertex data we fill, which is the vertex position.
1726 brw_MOV(p
, brw_message_reg(2), ndc
);
1727 brw_MOV(p
, brw_message_reg(3), pos
);
1728 brw_MOV(p
, brw_message_reg(7), pos
);
1729 len_vertex_header
= 6;
1731 /* There are 8 dwords in VUE header pre-Ironlake:
1732 * dword 0-3 (m1) is indices, point width, clip flags.
1733 * dword 4-7 (m2) is ndc position (set above)
1735 * dword 8-11 (m3) is the first vertex data, which we always have be the
1738 brw_MOV(p
, brw_message_reg(2), ndc
);
1739 brw_MOV(p
, brw_message_reg(3), pos
);
1740 len_vertex_header
= 2;
1743 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1744 next_mrf
= 2 + len_vertex_header
;
1745 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1746 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1748 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
1750 if (i
== VERT_RESULT_PSIZ
)
1753 if (i
>= VERT_RESULT_TEX0
&&
1754 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1755 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1757 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1758 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
1762 eot
= (c
->first_overflow_output
== 0);
1764 /* Message header, plus VUE header, plus the (first set of) outputs. */
1765 msg_len
= 1 + len_vertex_header
+ c
->nr_outputs
;
1766 msg_len
= align_interleaved_urb_mlen(brw
, msg_len
);
1767 /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
1768 msg_len
= MIN2(msg_len
, (BRW_MAX_MRF
- 1)),
1771 brw_null_reg(), /* dest */
1772 0, /* starting mrf reg nr */
1777 0, /* response len */
1779 eot
, /* writes complete */
1780 0, /* urb destination offset */
1781 BRW_URB_SWIZZLE_INTERLEAVE
);
1783 if (c
->first_overflow_output
> 0) {
1784 /* Not all of the vertex outputs/results fit into the MRF.
1785 * Move the overflowed attributes from the GRF to the MRF and
1786 * issue another brw_urb_WRITE().
1789 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1790 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1791 /* move from GRF to MRF */
1792 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1798 brw_null_reg(), /* dest */
1799 0, /* starting mrf reg nr */
1803 align_interleaved_urb_mlen(brw
, mrf
),
1804 0, /* response len */
1806 1, /* writes complete */
1807 14 / 2, /* urb destination offset */
1808 BRW_URB_SWIZZLE_INTERLEAVE
);
1813 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1815 struct brw_compile
*p
= &c
->func
;
1816 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1818 if (p
->nr_insn
== 0)
1821 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1824 if (val
.negate
|| val
.abs
)
1827 switch (prev_insn
->header
.opcode
) {
1828 case BRW_OPCODE_MOV
:
1829 case BRW_OPCODE_MAC
:
1830 case BRW_OPCODE_MUL
:
1831 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1832 prev_insn
->header
.execution_size
== val
.width
&&
1833 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1834 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1835 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1836 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1837 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1838 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1848 get_predicate(const struct prog_instruction
*inst
)
1850 if (inst
->DstReg
.CondMask
== COND_TR
)
1851 return BRW_PREDICATE_NONE
;
1853 /* All of GLSL only produces predicates for COND_NE and one channel per
1854 * vector. Fail badly if someone starts doing something else, as it might
1855 * mean infinite looping or something.
1857 * We'd like to support all the condition codes, but our hardware doesn't
1858 * quite match the Mesa IR, which is modeled after the NV extensions. For
1859 * those, the instruction may update the condition codes or not, then any
1860 * later instruction may use one of those condition codes. For gen4, the
1861 * instruction may update the flags register based on one of the condition
1862 * codes output by the instruction, and then further instructions may
1863 * predicate on that. We can probably support this, but it won't
1864 * necessarily be easy.
1866 assert(inst
->DstReg
.CondMask
== COND_NE
);
1868 switch (inst
->DstReg
.CondSwizzle
) {
1870 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1872 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1874 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1876 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1878 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1879 inst
->DstReg
.CondMask
);
1880 return BRW_PREDICATE_NORMAL
;
1885 brw_vs_rescale_gl_fixed(struct brw_vs_compile
*c
)
1887 struct brw_compile
*p
= &c
->func
;
1890 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
1891 if (!(c
->prog_data
.inputs_read
& (1 << i
)))
1894 if (c
->key
.gl_fixed_input_size
[i
] != 0) {
1895 struct brw_reg reg
= c
->regs
[PROGRAM_INPUT
][i
];
1898 brw_writemask(reg
, (1 << c
->key
.gl_fixed_input_size
[i
]) - 1),
1899 reg
, brw_imm_f(1.0 / 65536.0));
1904 /* Emit the vertex program instructions here.
1906 void brw_old_vs_emit(struct brw_vs_compile
*c
)
1908 #define MAX_IF_DEPTH 32
1909 #define MAX_LOOP_DEPTH 32
1910 struct brw_compile
*p
= &c
->func
;
1911 struct brw_context
*brw
= p
->brw
;
1912 struct intel_context
*intel
= &brw
->intel
;
1913 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1914 GLuint insn
, loop_depth
= 0;
1915 struct brw_instruction
*loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1916 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1917 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1921 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
1922 printf("vs-mesa:\n");
1923 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1928 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1929 brw_set_access_mode(p
, BRW_ALIGN_16
);
1930 if_depth_in_loop
[loop_depth
] = 0;
1932 brw_set_acc_write_control(p
, 1);
1934 for (insn
= 0; insn
< nr_insns
; insn
++) {
1936 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1938 /* Message registers can't be read, so copy the output into GRF
1939 * register if they are used in source registers
1941 for (i
= 0; i
< 3; i
++) {
1942 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1943 GLuint index
= src
->Index
;
1944 GLuint file
= src
->File
;
1945 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1946 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1949 switch (inst
->Opcode
) {
1952 c
->needs_stack
= GL_TRUE
;
1959 /* Static register allocation
1961 brw_vs_alloc_regs(c
);
1963 brw_vs_rescale_gl_fixed(c
);
1966 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1968 for (insn
= 0; insn
< nr_insns
; insn
++) {
1970 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1971 struct brw_reg args
[3], dst
;
1975 printf("%d: ", insn
);
1976 _mesa_print_instruction(inst
);
1979 /* Get argument regs. SWZ is special and does this itself.
1981 if (inst
->Opcode
!= OPCODE_SWZ
)
1982 for (i
= 0; i
< 3; i
++) {
1983 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1986 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
) {
1987 /* Can't just make get_arg "do the right thing" here because
1988 * other callers of get_arg and get_src_reg don't expect any
1989 * special behavior for the c->output_regs[index].used_in_src
1992 args
[i
] = c
->output_regs
[index
].reg
;
1993 args
[i
].dw1
.bits
.swizzle
=
1994 BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1995 GET_SWZ(src
->Swizzle
, 1),
1996 GET_SWZ(src
->Swizzle
, 2),
1997 GET_SWZ(src
->Swizzle
, 3));
1999 /* Note this is ok for non-swizzle ARB_vp instructions */
2000 args
[i
].negate
= src
->Negate
? 1 : 0;
2002 args
[i
] = get_arg(c
, inst
, i
);
2005 /* Get dest regs. Note that it is possible for a reg to be both
2006 * dst and arg, given the static allocation of registers. So
2007 * care needs to be taken emitting multi-operation instructions.
2009 index
= inst
->DstReg
.Index
;
2010 file
= inst
->DstReg
.File
;
2011 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
2012 /* Can't just make get_dst "do the right thing" here because other
2013 * callers of get_dst don't expect any special behavior for the
2014 * c->output_regs[index].used_in_src case.
2016 dst
= brw_writemask(c
->output_regs
[index
].reg
, inst
->DstReg
.WriteMask
);
2018 dst
= get_dst(c
, inst
->DstReg
);
2020 if (inst
->SaturateMode
!= SATURATE_OFF
) {
2021 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
2022 inst
->SaturateMode
);
2025 switch (inst
->Opcode
) {
2027 args
[0].negate
= false;
2028 brw_MOV(p
, dst
, brw_abs(args
[0]));
2031 brw_ADD(p
, dst
, args
[0], args
[1]);
2034 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2037 brw_DP2(p
, dst
, args
[0], args
[1]);
2040 brw_DP3(p
, dst
, args
[0], args
[1]);
2043 brw_DP4(p
, dst
, args
[0], args
[1]);
2046 brw_DPH(p
, dst
, args
[0], args
[1]);
2049 emit_nrm(c
, dst
, args
[0], 3);
2052 emit_nrm(c
, dst
, args
[0], 4);
2055 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
2058 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
2061 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2064 emit_arl(p
, dst
, args
[0]);
2067 brw_RNDD(p
, dst
, args
[0]);
2070 brw_FRC(p
, dst
, args
[0]);
2073 unalias1(c
, dst
, args
[0], emit_log_noalias
);
2076 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2079 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
2082 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
2085 if (!accumulator_contains(c
, args
[2]))
2086 brw_MOV(p
, brw_acc_reg(), args
[2]);
2087 brw_MAC(p
, dst
, args
[0], args
[1]);
2090 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
2093 emit_max(p
, dst
, args
[0], args
[1]);
2096 emit_min(p
, dst
, args
[0], args
[1]);
2099 brw_MOV(p
, dst
, args
[0]);
2102 brw_MUL(p
, dst
, args
[0], args
[1]);
2105 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
2108 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2111 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, brw_abs(args
[0]), BRW_MATH_PRECISION_FULL
);
2115 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
2118 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2121 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
2124 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
2127 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
2130 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
2133 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
2136 unalias1(c
, dst
, args
[0], emit_sign
);
2139 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
2142 /* The args[0] value can't be used here as it won't have
2143 * correctly encoded the full swizzle:
2145 emit_swz(c
, dst
, inst
);
2148 /* round toward zero */
2149 brw_RNDZ(p
, dst
, args
[0]);
2152 emit_xpd(p
, dst
, args
[0], args
[1]);
2155 struct brw_instruction
*if_inst
= brw_IF(p
, BRW_EXECUTE_8
);
2156 /* Note that brw_IF smashes the predicate_control field. */
2157 if_inst
->header
.predicate_control
= get_predicate(inst
);
2158 if_depth_in_loop
[loop_depth
]++;
2162 clear_current_const(c
);
2166 clear_current_const(c
);
2168 if_depth_in_loop
[loop_depth
]--;
2170 case OPCODE_BGNLOOP
:
2171 clear_current_const(c
);
2172 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
2173 if_depth_in_loop
[loop_depth
] = 0;
2176 brw_set_predicate_control(p
, get_predicate(inst
));
2177 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
2178 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2181 brw_set_predicate_control(p
, get_predicate(inst
));
2182 if (intel
->gen
>= 6) {
2183 gen6_CONT(p
, loop_inst
[loop_depth
- 1]);
2185 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
2187 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2190 case OPCODE_ENDLOOP
: {
2191 clear_current_const(c
);
2192 struct brw_instruction
*inst0
, *inst1
;
2197 if (intel
->gen
== 5)
2200 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2202 if (intel
->gen
< 6) {
2203 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2204 while (inst0
> loop_inst
[loop_depth
]) {
2206 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
2207 inst0
->bits3
.if_else
.jump_count
== 0) {
2208 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2209 } else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
2210 inst0
->bits3
.if_else
.jump_count
== 0) {
2211 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2219 brw_set_predicate_control(p
, get_predicate(inst
));
2220 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2221 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2224 brw_set_access_mode(p
, BRW_ALIGN_1
);
2225 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2226 brw_set_access_mode(p
, BRW_ALIGN_16
);
2227 brw_ADD(p
, get_addr_reg(stack_index
),
2228 get_addr_reg(stack_index
), brw_imm_d(4));
2229 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
2230 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2233 brw_ADD(p
, get_addr_reg(stack_index
),
2234 get_addr_reg(stack_index
), brw_imm_d(-4));
2235 brw_set_access_mode(p
, BRW_ALIGN_1
);
2236 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
2237 brw_set_access_mode(p
, BRW_ALIGN_16
);
2240 emit_vertex_write(c
);
2246 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
2252 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
2253 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
2254 _mesa_opcode_string(inst
->Opcode
) :
2258 /* Set the predication update on the last instruction of the native
2259 * instruction sequence.
2261 * This would be problematic if it was set on a math instruction,
2262 * but that shouldn't be the case with the current GLSL compiler.
2264 if (inst
->CondUpdate
) {
2265 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
2267 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
2268 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
2271 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
2272 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
2273 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
2274 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
2277 /* Result color clamping.
2279 * When destination register is an output register and
2280 * it's primary/secondary front/back color, we have to clamp
2281 * the result to [0,1]. This is done by enabling the
2282 * saturation bit for the last instruction.
2284 * We don't use brw_set_saturate() as it modifies
2285 * p->current->header.saturate, which affects all the subsequent
2286 * instructions. Instead, we directly modify the header
2287 * of the last (already stored) instruction.
2289 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
2290 c
->key
.clamp_vertex_color
) {
2291 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
2292 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
2293 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
2294 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
2295 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
2299 if (inst
->DstReg
.RelAddr
) {
2300 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
2301 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
2302 move_to_reladdr_dst(c
, inst
, dst
);
2308 brw_resolve_cals(p
);
2313 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
2316 printf("vs-native:\n");
2317 for (i
= 0; i
< p
->nr_insn
; i
++)
2318 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);