2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
/* NOTE(review): the per-opcode table initializer (original lines 47-65) is not
 * visible in this view.  From the lookup below, each entry apparently stores
 * 1 + the arg slot that is allowed to be an immediate — confirm against the
 * full table.
 */
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
/* NOTE(review): this bounds check uses '>'; when opcode equals
 * ARRAY_SIZE(opcode_array) the indexing below reads one element past the end
 * of the table — presumably '>=' was intended.  Verify and fix separately.
 */
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
124 clear_current_const(struct brw_vs_compile
*c
)
128 if (c
->vp
->use_const_buffer
) {
129 for (i
= 0; i
< 3; i
++) {
130 c
->current_const
[i
].index
= -1;
136 * Preallocate GRF register before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
140 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
142 struct intel_context
*intel
= &c
->func
.brw
->intel
;
143 GLuint i
, reg
= 0, mrf
, j
;
144 int attributes_in_vue
;
145 int first_reladdr_output
;
148 int vert_result_reoder
[VERT_RESULT_MAX
];
151 /* Determine whether to use a real constant buffer or use a block
152 * of GRF registers for constants. The later is faster but only
153 * works if everything fits in the GRF.
154 * XXX this heuristic/check may need some fine tuning...
156 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
157 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
158 c
->vp
->use_const_buffer
= GL_TRUE
;
160 c
->vp
->use_const_buffer
= GL_FALSE
;
162 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
164 /* r0 -- reserved as usual
166 c
->r0
= brw_vec8_grf(reg
, 0);
169 /* User clip planes from curbe:
171 if (c
->key
.nr_userclip
) {
172 if (intel
->gen
>= 6) {
173 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
174 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
175 (i
% 2) * 4), 0, 4, 1);
177 reg
+= ALIGN(c
->key
.nr_userclip
, 2) / 2;
179 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
180 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ (6 + i
) / 2,
181 (i
% 2) * 4), 0, 4, 1);
183 reg
+= (ALIGN(6 + c
->key
.nr_userclip
, 4) / 4) * 2;
188 /* Assign some (probably all) of the vertex program constants to
189 * the push constant buffer/CURBE.
191 * There's an obvious limit to the numer of push constants equal to
192 * the number of register available, and that number is smaller
193 * than the minimum maximum number of vertex program parameters, so
194 * support for pull constants is required if we overflow.
195 * Additionally, on gen6 the number of push constants is even
198 * When there's relative addressing, we don't know what range of
199 * Mesa IR registers can be accessed. And generally, when relative
200 * addressing is used we also have too many constants to load them
201 * all as push constants. So, we'll just support relative
202 * addressing out of the pull constant buffers, and try to load as
203 * many statically-accessed constants into the push constant buffer
206 if (intel
->gen
>= 6) {
207 /* We can only load 32 regs of push constants. */
208 max_constant
= 32 * 2 - c
->key
.nr_userclip
;
210 max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
213 /* constant_map maps from ParameterValues[] index to index in the
214 * push constant buffer, or -1 if it's only in the pull constant
217 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
219 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
221 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
224 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
225 if (inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
226 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
227 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
228 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
229 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) {
233 if (inst
->SrcReg
[arg
].RelAddr
) {
234 c
->vp
->use_const_buffer
= GL_TRUE
;
238 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
239 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
244 /* If we ran out of push constant space, then we'll also upload all
245 * constants through the pull constant buffer so that they can be
246 * accessed no matter what. For relative addressing (the common
247 * case) we need them all in place anyway.
249 if (constant
== max_constant
)
250 c
->vp
->use_const_buffer
= GL_TRUE
;
252 for (i
= 0; i
< constant
; i
++) {
253 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
257 reg
+= (constant
+ 1) / 2;
258 c
->prog_data
.curb_read_length
= reg
- 1;
259 c
->prog_data
.nr_params
= constant
* 4;
260 /* XXX 0 causes a bug elsewhere... */
261 if (intel
->gen
< 6 && c
->prog_data
.nr_params
== 0)
262 c
->prog_data
.nr_params
= 4;
264 /* Allocate input regs:
267 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
268 if (c
->prog_data
.inputs_read
& (1 << i
)) {
270 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
274 /* If there are no inputs, we'll still be reading one attribute's worth
275 * because it's required -- see urb_read_length setting.
277 if (c
->nr_inputs
== 0)
280 /* Allocate outputs. The non-position outputs go straight into message regs.
283 c
->first_output
= reg
;
284 c
->first_overflow_output
= 0;
286 if (intel
->gen
>= 6) {
288 if (c
->key
.nr_userclip
)
290 } else if (intel
->gen
== 5)
295 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
297 for (i
= 0; i
< VERT_RESULT_MAX
; i
++)
298 vert_result_reoder
[i
] = i
;
300 /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
301 if (intel
->gen
>= 6 && c
->key
.two_side_color
) {
302 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL1
)) &&
303 (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC1
))) {
304 assert(c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL0
));
305 assert(c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC0
));
307 } else if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL0
)) &&
308 (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC0
)))
312 for (i
= 0; i
< bfc
; i
++) {
313 vert_result_reoder
[VERT_RESULT_COL0
+ i
* 2 + 0] = VERT_RESULT_COL0
+ i
;
314 vert_result_reoder
[VERT_RESULT_COL0
+ i
* 2 + 1] = VERT_RESULT_BFC0
+ i
;
317 for (i
= VERT_RESULT_COL0
+ bfc
* 2; i
< VERT_RESULT_BFC0
+ bfc
; i
++) {
318 vert_result_reoder
[i
] = i
- bfc
;
323 for (j
= 0; j
< VERT_RESULT_MAX
; j
++) {
324 i
= vert_result_reoder
[j
];
326 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
328 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
329 if (i
== VERT_RESULT_HPOS
) {
330 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
333 else if (i
== VERT_RESULT_PSIZ
) {
334 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
338 /* Two restrictions on our compute-to-MRF here. The
339 * message length for all SEND messages is restricted to
340 * [1,15], so we can't use mrf 15, as that means a length
343 * Additionally, URB writes are aligned to URB rows, so we
344 * need to put an even number of registers of URB data in
345 * each URB write so that the later write is aligned. A
346 * message length of 15 means 1 message header reg plus 14
349 * For attributes beyond the compute-to-MRF, we compute to
350 * GRFs and they will be written in the second URB_WRITE.
352 if (first_reladdr_output
> i
&& mrf
< 15) {
353 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
357 if (mrf
>= 15 && !c
->first_overflow_output
)
358 c
->first_overflow_output
= i
;
359 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
367 /* Allocate program temporaries:
369 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
370 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
374 /* Address reg(s). Don't try to use the internal address reg until
377 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
378 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
382 BRW_VERTICAL_STRIDE_8
,
384 BRW_HORIZONTAL_STRIDE_1
,
390 if (c
->vp
->use_const_buffer
) {
391 for (i
= 0; i
< 3; i
++) {
392 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
395 clear_current_const(c
);
398 for (i
= 0; i
< 128; i
++) {
399 if (c
->output_regs
[i
].used_in_src
) {
400 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
405 if (c
->needs_stack
) {
406 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
410 /* Some opcodes need an internal temporary:
413 c
->last_tmp
= reg
; /* for allocation purposes */
415 /* Each input reg holds data from two vertices. The
416 * urb_read_length is the number of registers read from *each*
417 * vertex urb, so is half the amount:
419 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
420 /* Setting this field to 0 leads to undefined behavior according to the
421 * the VS_STATE docs. Our VUEs will always have at least one attribute
422 * sitting in them, even if it's padding.
424 if (c
->prog_data
.urb_read_length
== 0)
425 c
->prog_data
.urb_read_length
= 1;
427 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
428 * them to fit the biggest thing they need to.
430 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
432 /* See emit_vertex_write() for where the VUE's overhead on top of the
433 * attributes comes from.
435 if (intel
->gen
>= 7) {
437 if (c
->key
.nr_userclip
)
440 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
441 * number of 64-byte (512-bit) units.
443 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 3) / 4;
444 } else if (intel
->gen
== 6) {
446 if (c
->key
.nr_userclip
)
449 /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
450 * number of 128-byte (1024-bit) units.
452 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 7) / 8;
453 } else if (intel
->gen
== 5)
454 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
455 * number of 64-byte (512-bit) units.
457 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
459 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
461 c
->prog_data
.total_grf
= reg
;
463 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
464 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
465 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
466 printf("%s reg = %d\n", __FUNCTION__
, reg
);
472 * If an instruction uses a temp reg both as a src and the dest, we
473 * sometimes need to allocate an intermediate temporary.
475 static void unalias1( struct brw_vs_compile
*c
,
478 void (*func
)( struct brw_vs_compile
*,
482 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
483 struct brw_compile
*p
= &c
->func
;
484 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
486 brw_MOV(p
, dst
, tmp
);
496 * Checkes if 2-operand instruction needs an intermediate temporary.
498 static void unalias2( struct brw_vs_compile
*c
,
502 void (*func
)( struct brw_vs_compile
*,
507 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
508 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
509 struct brw_compile
*p
= &c
->func
;
510 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
511 func(c
, tmp
, arg0
, arg1
);
512 brw_MOV(p
, dst
, tmp
);
516 func(c
, dst
, arg0
, arg1
);
522 * Checkes if 3-operand instruction needs an intermediate temporary.
524 static void unalias3( struct brw_vs_compile
*c
,
529 void (*func
)( struct brw_vs_compile
*,
535 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
536 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
537 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
538 struct brw_compile
*p
= &c
->func
;
539 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
540 func(c
, tmp
, arg0
, arg1
, arg2
);
541 brw_MOV(p
, dst
, tmp
);
545 func(c
, dst
, arg0
, arg1
, arg2
);
549 static void emit_sop( struct brw_vs_compile
*c
,
555 struct brw_compile
*p
= &c
->func
;
557 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
558 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
559 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
560 brw_set_predicate_control_flag_value(p
, 0xff);
563 static void emit_seq( struct brw_vs_compile
*c
,
566 struct brw_reg arg1
)
568 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
571 static void emit_sne( struct brw_vs_compile
*c
,
574 struct brw_reg arg1
)
576 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
578 static void emit_slt( struct brw_vs_compile
*c
,
581 struct brw_reg arg1
)
583 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
586 static void emit_sle( struct brw_vs_compile
*c
,
589 struct brw_reg arg1
)
591 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
594 static void emit_sgt( struct brw_vs_compile
*c
,
597 struct brw_reg arg1
)
599 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
602 static void emit_sge( struct brw_vs_compile
*c
,
605 struct brw_reg arg1
)
607 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
610 static void emit_cmp( struct brw_compile
*p
,
614 struct brw_reg arg2
)
616 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
617 brw_SEL(p
, dst
, arg1
, arg2
);
618 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
621 static void emit_sign(struct brw_vs_compile
*c
,
625 struct brw_compile
*p
= &c
->func
;
627 brw_MOV(p
, dst
, brw_imm_f(0));
629 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
630 brw_MOV(p
, dst
, brw_imm_f(-1.0));
631 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
633 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
634 brw_MOV(p
, dst
, brw_imm_f(1.0));
635 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
638 static void emit_max( struct brw_compile
*p
,
641 struct brw_reg arg1
)
643 struct intel_context
*intel
= &p
->brw
->intel
;
645 if (intel
->gen
>= 6) {
646 brw_set_conditionalmod(p
, BRW_CONDITIONAL_GE
);
647 brw_SEL(p
, dst
, arg0
, arg1
);
648 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
649 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
651 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
652 brw_SEL(p
, dst
, arg0
, arg1
);
653 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
657 static void emit_min( struct brw_compile
*p
,
660 struct brw_reg arg1
)
662 struct intel_context
*intel
= &p
->brw
->intel
;
664 if (intel
->gen
>= 6) {
665 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
666 brw_SEL(p
, dst
, arg0
, arg1
);
667 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
668 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
670 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
671 brw_SEL(p
, dst
, arg0
, arg1
);
672 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
676 static void emit_arl(struct brw_compile
*p
,
680 struct intel_context
*intel
= &p
->brw
->intel
;
682 if (intel
->gen
>= 6) {
683 struct brw_reg dst_f
= retype(dst
, BRW_REGISTER_TYPE_F
);
685 brw_RNDD(p
, dst_f
, src
);
686 brw_MOV(p
, dst
, dst_f
);
688 brw_RNDD(p
, dst
, src
);
692 static void emit_math1_gen4(struct brw_vs_compile
*c
,
698 /* There are various odd behaviours with SEND on the simulator. In
699 * addition there are documented issues with the fact that the GEN4
700 * processor doesn't do dependency control properly on SEND
701 * results. So, on balance, this kludge to get around failures
702 * with writemasked math results looks like it might be necessary
703 * whether that turns out to be a simulator bug or not:
705 struct brw_compile
*p
= &c
->func
;
706 struct brw_reg tmp
= dst
;
707 GLboolean need_tmp
= GL_FALSE
;
709 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
710 dst
.dw1
.bits
.writemask
!= 0xf)
719 BRW_MATH_SATURATE_NONE
,
722 BRW_MATH_DATA_SCALAR
,
726 brw_MOV(p
, dst
, tmp
);
732 emit_math1_gen6(struct brw_vs_compile
*c
,
738 struct brw_compile
*p
= &c
->func
;
739 struct brw_reg tmp_src
, tmp_dst
;
741 /* Something is strange on gen6 math in 16-wide mode, though the
742 * docs say it's supposed to work. Punt to using align1 mode,
743 * which doesn't do writemasking and swizzles.
745 tmp_src
= get_tmp(c
);
746 tmp_dst
= get_tmp(c
);
748 brw_MOV(p
, tmp_src
, arg0
);
750 brw_set_access_mode(p
, BRW_ALIGN_1
);
754 BRW_MATH_SATURATE_NONE
,
757 BRW_MATH_DATA_SCALAR
,
759 brw_set_access_mode(p
, BRW_ALIGN_16
);
761 brw_MOV(p
, dst
, tmp_dst
);
763 release_tmp(c
, tmp_src
);
764 release_tmp(c
, tmp_dst
);
768 emit_math1(struct brw_vs_compile
*c
,
774 struct brw_compile
*p
= &c
->func
;
775 struct intel_context
*intel
= &p
->brw
->intel
;
778 emit_math1_gen6(c
, function
, dst
, arg0
, precision
);
780 emit_math1_gen4(c
, function
, dst
, arg0
, precision
);
783 static void emit_math2_gen4( struct brw_vs_compile
*c
,
790 struct brw_compile
*p
= &c
->func
;
791 struct brw_reg tmp
= dst
;
792 GLboolean need_tmp
= GL_FALSE
;
794 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
795 dst
.dw1
.bits
.writemask
!= 0xf)
801 brw_MOV(p
, brw_message_reg(3), arg1
);
806 BRW_MATH_SATURATE_NONE
,
809 BRW_MATH_DATA_SCALAR
,
813 brw_MOV(p
, dst
, tmp
);
818 static void emit_math2_gen6( struct brw_vs_compile
*c
,
825 struct brw_compile
*p
= &c
->func
;
826 struct brw_reg tmp_src0
, tmp_src1
, tmp_dst
;
828 tmp_src0
= get_tmp(c
);
829 tmp_src1
= get_tmp(c
);
830 tmp_dst
= get_tmp(c
);
832 brw_MOV(p
, tmp_src0
, arg0
);
833 brw_MOV(p
, tmp_src1
, arg1
);
835 brw_set_access_mode(p
, BRW_ALIGN_1
);
841 brw_set_access_mode(p
, BRW_ALIGN_16
);
843 brw_MOV(p
, dst
, tmp_dst
);
845 release_tmp(c
, tmp_src0
);
846 release_tmp(c
, tmp_src1
);
847 release_tmp(c
, tmp_dst
);
850 static void emit_math2( struct brw_vs_compile
*c
,
857 struct brw_compile
*p
= &c
->func
;
858 struct intel_context
*intel
= &p
->brw
->intel
;
861 emit_math2_gen6(c
, function
, dst
, arg0
, arg1
, precision
);
863 emit_math2_gen4(c
, function
, dst
, arg0
, arg1
, precision
);
866 static void emit_exp_noalias( struct brw_vs_compile
*c
,
868 struct brw_reg arg0
)
870 struct brw_compile
*p
= &c
->func
;
873 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
874 struct brw_reg tmp
= get_tmp(c
);
875 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
877 /* tmp_d = floor(arg0.x) */
878 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
880 /* result[0] = 2.0 ^ tmp */
882 /* Adjust exponent for floating point:
885 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
887 /* Install exponent and sign.
888 * Excess drops off the edge:
890 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
891 tmp_d
, brw_imm_d(23));
896 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
897 /* result[1] = arg0.x - floor(arg0.x) */
898 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
901 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
902 /* As with the LOG instruction, we might be better off just
903 * doing a taylor expansion here, seeing as we have to do all
906 * If mathbox partial precision is too low, consider also:
907 * result[3] = result[0] * EXP(result[1])
910 BRW_MATH_FUNCTION_EXP
,
911 brw_writemask(dst
, WRITEMASK_Z
),
912 brw_swizzle1(arg0
, 0),
913 BRW_MATH_PRECISION_FULL
);
916 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
917 /* result[3] = 1.0; */
918 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
923 static void emit_log_noalias( struct brw_vs_compile
*c
,
925 struct brw_reg arg0
)
927 struct brw_compile
*p
= &c
->func
;
928 struct brw_reg tmp
= dst
;
929 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
930 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
931 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
932 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
936 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
939 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
942 * These almost look likey they could be joined up, but not really
945 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
946 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
948 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
950 brw_writemask(tmp_ud
, WRITEMASK_X
),
951 brw_swizzle1(arg0_ud
, 0),
952 brw_imm_ud((1U<<31)-1));
955 brw_writemask(tmp_ud
, WRITEMASK_X
),
960 brw_writemask(tmp
, WRITEMASK_X
),
961 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
965 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
967 brw_writemask(tmp_ud
, WRITEMASK_Y
),
968 brw_swizzle1(arg0_ud
, 0),
969 brw_imm_ud((1<<23)-1));
972 brw_writemask(tmp_ud
, WRITEMASK_Y
),
974 brw_imm_ud(127<<23));
977 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
978 /* result[2] = result[0] + LOG2(result[1]); */
980 /* Why bother? The above is just a hint how to do this with a
981 * taylor series. Maybe we *should* use a taylor series as by
982 * the time all the above has been done it's almost certainly
983 * quicker than calling the mathbox, even with low precision.
986 * - result[0] + mathbox.LOG2(result[1])
987 * - mathbox.LOG2(arg0.x)
988 * - result[0] + inline_taylor_approx(result[1])
991 BRW_MATH_FUNCTION_LOG
,
992 brw_writemask(tmp
, WRITEMASK_Z
),
993 brw_swizzle1(tmp
, 1),
994 BRW_MATH_PRECISION_FULL
);
997 brw_writemask(tmp
, WRITEMASK_Z
),
998 brw_swizzle1(tmp
, 2),
999 brw_swizzle1(tmp
, 0));
1002 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
1003 /* result[3] = 1.0; */
1004 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
1008 brw_MOV(p
, dst
, tmp
);
1009 release_tmp(c
, tmp
);
1014 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
1016 static void emit_dst_noalias( struct brw_vs_compile
*c
,
1018 struct brw_reg arg0
,
1019 struct brw_reg arg1
)
1021 struct brw_compile
*p
= &c
->func
;
1023 /* There must be a better way to do this:
1025 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
1026 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
1027 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
1028 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
1029 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
1030 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
1031 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
1032 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
1036 static void emit_xpd( struct brw_compile
*p
,
1041 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
1042 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
1046 static void emit_lit_noalias( struct brw_vs_compile
*c
,
1048 struct brw_reg arg0
)
1050 struct brw_compile
*p
= &c
->func
;
1051 struct brw_reg tmp
= dst
;
1052 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1057 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
1058 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
1060 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1061 * to get all channels active inside the IF. In the clipping code
1062 * we run with NoMask, so it's not an option and we can use
1063 * BRW_EXECUTE_1 for all comparisions.
1065 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
1066 brw_IF(p
, BRW_EXECUTE_8
);
1068 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
1070 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
1071 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
1072 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1075 BRW_MATH_FUNCTION_POW
,
1076 brw_writemask(dst
, WRITEMASK_Z
),
1077 brw_swizzle1(tmp
, 2),
1078 brw_swizzle1(arg0
, 3),
1079 BRW_MATH_PRECISION_PARTIAL
);
1083 release_tmp(c
, tmp
);
1086 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
1088 struct brw_reg arg0
,
1089 struct brw_reg arg1
,
1090 struct brw_reg arg2
)
1092 struct brw_compile
*p
= &c
->func
;
1094 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
1095 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
1096 brw_MAC(p
, dst
, arg0
, arg1
);
1099 /** 3 or 4-component vector normalization */
1100 static void emit_nrm( struct brw_vs_compile
*c
,
1102 struct brw_reg arg0
,
1105 struct brw_compile
*p
= &c
->func
;
1106 struct brw_reg tmp
= get_tmp(c
);
1108 /* tmp = dot(arg0, arg0) */
1110 brw_DP3(p
, tmp
, arg0
, arg0
);
1112 brw_DP4(p
, tmp
, arg0
, arg0
);
1114 /* tmp = 1 / sqrt(tmp) */
1115 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
1117 /* dst = arg0 * tmp */
1118 brw_MUL(p
, dst
, arg0
, tmp
);
1120 release_tmp(c
, tmp
);
1124 static struct brw_reg
1125 get_constant(struct brw_vs_compile
*c
,
1126 const struct prog_instruction
*inst
,
1129 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1130 struct brw_compile
*p
= &c
->func
;
1131 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1133 assert(argIndex
< 3);
1135 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
1136 /* Keep track of the last constant loaded in this slot, for reuse. */
1137 c
->current_const
[argIndex
].index
= src
->Index
;
1140 printf(" fetch const[%d] for arg %d into reg %d\n",
1141 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1143 /* need to fetch the constant now */
1145 const_reg
, /* writeback dest */
1146 16 * src
->Index
, /* byte offset */
1147 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
1151 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1152 const_reg
= stride(const_reg
, 0, 4, 1);
1153 const_reg
.subnr
= 0;
1158 static struct brw_reg
1159 get_reladdr_constant(struct brw_vs_compile
*c
,
1160 const struct prog_instruction
*inst
,
1163 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1164 struct brw_compile
*p
= &c
->func
;
1165 struct brw_context
*brw
= p
->brw
;
1166 struct intel_context
*intel
= &brw
->intel
;
1167 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1168 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1171 assert(argIndex
< 3);
1173 /* Can't reuse a reladdr constant load. */
1174 c
->current_const
[argIndex
].index
= -1;
1177 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1178 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1181 if (intel
->gen
>= 6) {
1182 offset
= src
->Index
;
1184 struct brw_reg byte_addr_reg
= retype(get_tmp(c
), BRW_REGISTER_TYPE_D
);
1185 brw_MUL(p
, byte_addr_reg
, addr_reg
, brw_imm_d(16));
1186 addr_reg
= byte_addr_reg
;
1187 offset
= 16 * src
->Index
;
1190 /* fetch the first vec4 */
1191 brw_dp_READ_4_vs_relative(p
,
1195 SURF_INDEX_VERT_CONST_BUFFER
);
1202 /* TODO: relative addressing!
1204 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
1205 gl_register_file file
,
1209 case PROGRAM_TEMPORARY
:
1211 case PROGRAM_OUTPUT
:
1212 assert(c
->regs
[file
][index
].nr
!= 0);
1213 return c
->regs
[file
][index
];
1214 case PROGRAM_STATE_VAR
:
1215 case PROGRAM_CONSTANT
:
1216 case PROGRAM_UNIFORM
:
1217 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1218 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1219 case PROGRAM_ADDRESS
:
1221 return c
->regs
[file
][index
];
1223 case PROGRAM_UNDEFINED
: /* undef values */
1224 return brw_null_reg();
1226 case PROGRAM_LOCAL_PARAM
:
1227 case PROGRAM_ENV_PARAM
:
1228 case PROGRAM_WRITE_ONLY
:
1231 return brw_null_reg();
1237 * Indirect addressing: get reg[[arg] + offset].
1239 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1244 struct brw_compile
*p
= &c
->func
;
1245 struct brw_reg tmp
= get_tmp(c
);
1246 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1247 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1248 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1249 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1250 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1252 /* Set the vertical stride on the register access so that the first
1253 * 4 components come from a0.0 and the second 4 from a0.1.
1255 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1258 brw_push_insn_state(p
);
1259 brw_set_access_mode(p
, BRW_ALIGN_1
);
1261 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1262 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1264 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1265 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1267 brw_MOV(p
, tmp
, indirect
);
1269 brw_pop_insn_state(p
);
1272 /* NOTE: tmp not released */
1277 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1278 const struct prog_instruction
*inst
,
1281 struct brw_compile
*p
= &c
->func
;
1283 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1284 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1285 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1286 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1287 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1288 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1290 /* Because destination register indirect addressing can only use
1291 * one index, we'll write each vertex's vec4 value separately.
1293 val
.width
= BRW_WIDTH_4
;
1294 val
.vstride
= BRW_VERTICAL_STRIDE_4
;
1296 brw_push_insn_state(p
);
1297 brw_set_access_mode(p
, BRW_ALIGN_1
);
1299 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1300 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1301 brw_MOV(p
, indirect
, val
);
1303 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1304 brw_ADD(p
, brw_address_reg(0), acc
,
1305 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1306 brw_MOV(p
, indirect
, suboffset(val
, 4));
1308 brw_pop_insn_state(p
);
1312 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1313 * TODO: relative addressing!
1315 static struct brw_reg
1316 get_src_reg( struct brw_vs_compile
*c
,
1317 const struct prog_instruction
*inst
,
1320 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1321 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1322 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1324 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1325 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1327 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1331 return brw_imm_f(0.0f
);
1332 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1337 return brw_imm_f(-1.0F
);
1339 return brw_imm_f(1.0F
);
1340 } else if (src
->File
== PROGRAM_CONSTANT
) {
1341 const struct gl_program_parameter_list
*params
;
1345 switch (src
->Swizzle
) {
1360 if (component
>= 0) {
1361 params
= c
->vp
->program
.Base
.Parameters
;
1362 f
= params
->ParameterValues
[src
->Index
][component
].f
;
1368 return brw_imm_f(f
);
1374 case PROGRAM_TEMPORARY
:
1376 case PROGRAM_OUTPUT
:
1378 return deref(c
, c
->regs
[file
][0], index
, 32);
1381 assert(c
->regs
[file
][index
].nr
!= 0);
1382 return c
->regs
[file
][index
];
1385 case PROGRAM_STATE_VAR
:
1386 case PROGRAM_CONSTANT
:
1387 case PROGRAM_UNIFORM
:
1388 case PROGRAM_ENV_PARAM
:
1389 case PROGRAM_LOCAL_PARAM
:
1390 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1391 /* Take from the push constant buffer if possible. */
1392 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1393 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1395 /* Must be in the pull constant buffer then .*/
1396 assert(c
->vp
->use_const_buffer
);
1398 return get_reladdr_constant(c
, inst
, argIndex
);
1400 return get_constant(c
, inst
, argIndex
);
1402 case PROGRAM_ADDRESS
:
1404 return c
->regs
[file
][index
];
1406 case PROGRAM_UNDEFINED
:
1407 /* this is a normal case since we loop over all three src args */
1408 return brw_null_reg();
1410 case PROGRAM_WRITE_ONLY
:
1413 return brw_null_reg();
1418 * Return the brw reg for the given instruction's src argument.
1419 * Will return mangled results for SWZ op. The emit_swz() function
1420 * ignores this result and recalculates taking extended swizzles into
1423 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1424 const struct prog_instruction
*inst
,
1427 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1430 if (src
->File
== PROGRAM_UNDEFINED
)
1431 return brw_null_reg();
1433 reg
= get_src_reg(c
, inst
, argIndex
);
1435 /* Convert 3-bit swizzle to 2-bit.
1437 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1438 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1439 GET_SWZ(src
->Swizzle
, 1),
1440 GET_SWZ(src
->Swizzle
, 2),
1441 GET_SWZ(src
->Swizzle
, 3));
1443 /* Note this is ok for non-swizzle ARB_vp instructions */
1444 reg
.negate
= src
->Negate
? 1 : 0;
1452 * Get brw register for the given program dest register.
1454 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1455 struct prog_dst_register dst
)
1460 case PROGRAM_TEMPORARY
:
1461 case PROGRAM_OUTPUT
:
1462 /* register-indirect addressing is only 1x1, not VxH, for
1463 * destination regs. So, for RelAddr we'll return a temporary
1464 * for the dest and do a move of the result to the RelAddr
1465 * register after the instruction emit.
1470 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1471 reg
= c
->regs
[dst
.File
][dst
.Index
];
1474 case PROGRAM_ADDRESS
:
1475 assert(dst
.Index
== 0);
1476 reg
= c
->regs
[dst
.File
][dst
.Index
];
1478 case PROGRAM_UNDEFINED
:
1479 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1480 reg
= brw_null_reg();
1484 reg
= brw_null_reg();
1487 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1488 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1494 static void emit_swz( struct brw_vs_compile
*c
,
1496 const struct prog_instruction
*inst
)
1498 const GLuint argIndex
= 0;
1499 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1500 struct brw_compile
*p
= &c
->func
;
1501 GLuint zeros_mask
= 0;
1502 GLuint ones_mask
= 0;
1503 GLuint src_mask
= 0;
1505 GLboolean need_tmp
= (src
.Negate
&&
1506 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1507 struct brw_reg tmp
= dst
;
1513 for (i
= 0; i
< 4; i
++) {
1514 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1515 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1534 /* Do src first, in case dst aliases src:
1537 struct brw_reg arg0
;
1539 arg0
= get_src_reg(c
, inst
, argIndex
);
1541 arg0
= brw_swizzle(arg0
,
1542 src_swz
[0], src_swz
[1],
1543 src_swz
[2], src_swz
[3]);
1545 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1549 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1552 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1555 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1558 brw_MOV(p
, dst
, tmp
);
1559 release_tmp(c
, tmp
);
1564 align_interleaved_urb_mlen(struct brw_context
*brw
, int mlen
)
1566 struct intel_context
*intel
= &brw
->intel
;
1568 if (intel
->gen
>= 6) {
1569 /* URB data written (does not include the message header reg) must
1570 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1571 * section 5.4.3.2.2: URB_INTERLEAVED.
1573 * URB entries are allocated on a multiple of 1024 bits, so an
1574 * extra 128 bits written here to make the end align to 256 is
1577 if ((mlen
% 2) != 1)
1585 * Post-vertex-program processing. Send the results to the URB.
1587 static void emit_vertex_write( struct brw_vs_compile
*c
)
1589 struct brw_compile
*p
= &c
->func
;
1590 struct brw_context
*brw
= p
->brw
;
1591 struct intel_context
*intel
= &brw
->intel
;
1592 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1595 GLuint len_vertex_header
= 2;
1599 if (c
->key
.copy_edgeflag
) {
1601 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1602 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1605 if (intel
->gen
< 6) {
1606 /* Build ndc coords */
1608 /* ndc = 1.0 / pos.w */
1609 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1610 /* ndc.xyz = pos * ndc */
1611 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1614 /* Update the header for point size, user clipping flags, and -ve rhw
1617 if (intel
->gen
>= 6) {
1618 struct brw_reg m1
= brw_message_reg(1);
1620 /* On gen6, m1 has each value in a separate dword, so we never
1621 * need to mess with a temporary for computing the m1 value.
1623 brw_MOV(p
, retype(m1
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1624 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1625 brw_MOV(p
, brw_writemask(m1
, WRITEMASK_W
),
1626 brw_swizzle1(c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
], 0));
1629 /* Set the user clip distances in dword 8-15. (m3-4)*/
1630 if (c
->key
.nr_userclip
) {
1631 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1634 m
= brw_message_reg(3);
1636 m
= brw_message_reg(4);
1638 brw_DP4(p
, brw_writemask(m
, (1 << (i
& 7))),pos
, c
->userplane
[i
]);
1641 } else if ((c
->prog_data
.outputs_written
&
1642 BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1643 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
) {
1644 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1647 brw_MOV(p
, header1
, brw_imm_ud(0));
1649 brw_set_access_mode(p
, BRW_ALIGN_16
);
1651 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1652 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1653 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
),
1654 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1655 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
),
1656 header1
, brw_imm_ud(0x7ff<<8));
1659 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1660 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1661 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1662 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1663 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1666 /* i965 clipping workaround:
1667 * 1) Test for -ve rhw
1669 * set ndc = (0,0,0,0)
1672 * Later, clipping will detect ucp[6] and ensure the primitive is
1673 * clipped against all fixed planes.
1675 if (brw
->has_negative_rhw_bug
) {
1677 vec8(brw_null_reg()),
1679 brw_swizzle1(ndc
, 3),
1682 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1683 brw_MOV(p
, ndc
, brw_imm_f(0));
1684 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1687 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1688 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1689 brw_set_access_mode(p
, BRW_ALIGN_16
);
1691 release_tmp(c
, header1
);
1694 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1697 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1698 * of zeros followed by two sets of NDC coordinates:
1700 brw_set_access_mode(p
, BRW_ALIGN_1
);
1701 brw_set_acc_write_control(p
, 0);
1703 /* The VUE layout is documented in Volume 2a. */
1704 if (intel
->gen
>= 6) {
1705 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1706 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1707 * dword 4-7 (m2) is the 4D space position
1708 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1710 * m3 or 5 is the first vertex element data we fill, which is
1711 * the vertex position.
1713 brw_MOV(p
, brw_message_reg(2), pos
);
1714 len_vertex_header
= 1;
1715 if (c
->key
.nr_userclip
> 0)
1716 len_vertex_header
+= 2;
1717 } else if (intel
->gen
== 5) {
1718 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1719 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1720 * dword 4-7 (m2) is the ndc position (set above)
1721 * dword 8-11 (m3) of the vertex header is the 4D space position
1722 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1723 * m6 is a pad so that the vertex element data is aligned
1724 * m7 is the first vertex data we fill, which is the vertex position.
1726 brw_MOV(p
, brw_message_reg(2), ndc
);
1727 brw_MOV(p
, brw_message_reg(3), pos
);
1728 brw_MOV(p
, brw_message_reg(7), pos
);
1729 len_vertex_header
= 6;
1731 /* There are 8 dwords in VUE header pre-Ironlake:
1732 * dword 0-3 (m1) is indices, point width, clip flags.
1733 * dword 4-7 (m2) is ndc position (set above)
1735 * dword 8-11 (m3) is the first vertex data, which we always have be the
1738 brw_MOV(p
, brw_message_reg(2), ndc
);
1739 brw_MOV(p
, brw_message_reg(3), pos
);
1740 len_vertex_header
= 2;
1743 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1744 next_mrf
= 2 + len_vertex_header
;
1745 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1746 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1748 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
1750 if (i
== VERT_RESULT_PSIZ
)
1753 if (i
>= VERT_RESULT_TEX0
&&
1754 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1755 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1757 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1758 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
1762 eot
= (c
->first_overflow_output
== 0);
1764 /* Message header, plus VUE header, plus the (first set of) outputs. */
1765 msg_len
= 1 + len_vertex_header
+ c
->nr_outputs
;
1766 msg_len
= align_interleaved_urb_mlen(brw
, msg_len
);
1767 /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
1768 msg_len
= MIN2(msg_len
, (BRW_MAX_MRF
- 1)),
1771 brw_null_reg(), /* dest */
1772 0, /* starting mrf reg nr */
1777 0, /* response len */
1779 eot
, /* writes complete */
1780 0, /* urb destination offset */
1781 BRW_URB_SWIZZLE_INTERLEAVE
);
1783 if (c
->first_overflow_output
> 0) {
1784 /* Not all of the vertex outputs/results fit into the MRF.
1785 * Move the overflowed attributes from the GRF to the MRF and
1786 * issue another brw_urb_WRITE().
1789 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1790 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1791 /* move from GRF to MRF */
1792 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1798 brw_null_reg(), /* dest */
1799 0, /* starting mrf reg nr */
1803 align_interleaved_urb_mlen(brw
, mrf
),
1804 0, /* response len */
1806 1, /* writes complete */
1807 14 / 2, /* urb destination offset */
1808 BRW_URB_SWIZZLE_INTERLEAVE
);
1813 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1815 struct brw_compile
*p
= &c
->func
;
1816 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1818 if (p
->nr_insn
== 0)
1821 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1824 switch (prev_insn
->header
.opcode
) {
1825 case BRW_OPCODE_MOV
:
1826 case BRW_OPCODE_MAC
:
1827 case BRW_OPCODE_MUL
:
1828 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1829 prev_insn
->header
.execution_size
== val
.width
&&
1830 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1831 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1832 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1833 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1834 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1835 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1845 get_predicate(const struct prog_instruction
*inst
)
1847 if (inst
->DstReg
.CondMask
== COND_TR
)
1848 return BRW_PREDICATE_NONE
;
1850 /* All of GLSL only produces predicates for COND_NE and one channel per
1851 * vector. Fail badly if someone starts doing something else, as it might
1852 * mean infinite looping or something.
1854 * We'd like to support all the condition codes, but our hardware doesn't
1855 * quite match the Mesa IR, which is modeled after the NV extensions. For
1856 * those, the instruction may update the condition codes or not, then any
1857 * later instruction may use one of those condition codes. For gen4, the
1858 * instruction may update the flags register based on one of the condition
1859 * codes output by the instruction, and then further instructions may
1860 * predicate on that. We can probably support this, but it won't
1861 * necessarily be easy.
1863 assert(inst
->DstReg
.CondMask
== COND_NE
);
1865 switch (inst
->DstReg
.CondSwizzle
) {
1867 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1869 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1871 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1873 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1875 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1876 inst
->DstReg
.CondMask
);
1877 return BRW_PREDICATE_NORMAL
;
1882 brw_vs_rescale_gl_fixed(struct brw_vs_compile
*c
)
1884 struct brw_compile
*p
= &c
->func
;
1887 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
1888 if (!(c
->prog_data
.inputs_read
& (1 << i
)))
1891 if (c
->key
.gl_fixed_input_size
[i
] != 0) {
1892 struct brw_reg reg
= c
->regs
[PROGRAM_INPUT
][i
];
1895 brw_writemask(reg
, (1 << c
->key
.gl_fixed_input_size
[i
]) - 1),
1896 reg
, brw_imm_f(1.0 / 65536.0));
1901 /* Emit the vertex program instructions here.
1903 void brw_vs_emit(struct brw_vs_compile
*c
)
1905 #define MAX_IF_DEPTH 32
1906 #define MAX_LOOP_DEPTH 32
1907 struct brw_compile
*p
= &c
->func
;
1908 struct brw_context
*brw
= p
->brw
;
1909 struct intel_context
*intel
= &brw
->intel
;
1910 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1911 GLuint insn
, loop_depth
= 0;
1912 struct brw_instruction
*loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1913 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1914 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1918 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
1919 printf("vs-mesa:\n");
1920 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1925 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1926 brw_set_access_mode(p
, BRW_ALIGN_16
);
1927 if_depth_in_loop
[loop_depth
] = 0;
1929 brw_set_acc_write_control(p
, 1);
1931 for (insn
= 0; insn
< nr_insns
; insn
++) {
1933 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1935 /* Message registers can't be read, so copy the output into GRF
1936 * register if they are used in source registers
1938 for (i
= 0; i
< 3; i
++) {
1939 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1940 GLuint index
= src
->Index
;
1941 GLuint file
= src
->File
;
1942 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1943 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1946 switch (inst
->Opcode
) {
1949 c
->needs_stack
= GL_TRUE
;
1956 /* Static register allocation
1958 brw_vs_alloc_regs(c
);
1960 brw_vs_rescale_gl_fixed(c
);
1963 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1965 for (insn
= 0; insn
< nr_insns
; insn
++) {
1967 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1968 struct brw_reg args
[3], dst
;
1972 printf("%d: ", insn
);
1973 _mesa_print_instruction(inst
);
1976 /* Get argument regs. SWZ is special and does this itself.
1978 if (inst
->Opcode
!= OPCODE_SWZ
)
1979 for (i
= 0; i
< 3; i
++) {
1980 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1983 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1984 args
[i
] = c
->output_regs
[index
].reg
;
1986 args
[i
] = get_arg(c
, inst
, i
);
1989 /* Get dest regs. Note that it is possible for a reg to be both
1990 * dst and arg, given the static allocation of registers. So
1991 * care needs to be taken emitting multi-operation instructions.
1993 index
= inst
->DstReg
.Index
;
1994 file
= inst
->DstReg
.File
;
1995 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1996 dst
= c
->output_regs
[index
].reg
;
1998 dst
= get_dst(c
, inst
->DstReg
);
2000 if (inst
->SaturateMode
!= SATURATE_OFF
) {
2001 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
2002 inst
->SaturateMode
);
2005 switch (inst
->Opcode
) {
2007 args
[0].negate
= false;
2008 brw_MOV(p
, dst
, brw_abs(args
[0]));
2011 brw_ADD(p
, dst
, args
[0], args
[1]);
2014 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2017 brw_DP2(p
, dst
, args
[0], args
[1]);
2020 brw_DP3(p
, dst
, args
[0], args
[1]);
2023 brw_DP4(p
, dst
, args
[0], args
[1]);
2026 brw_DPH(p
, dst
, args
[0], args
[1]);
2029 emit_nrm(c
, dst
, args
[0], 3);
2032 emit_nrm(c
, dst
, args
[0], 4);
2035 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
2038 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
2041 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2044 emit_arl(p
, dst
, args
[0]);
2047 brw_RNDD(p
, dst
, args
[0]);
2050 brw_FRC(p
, dst
, args
[0]);
2053 unalias1(c
, dst
, args
[0], emit_log_noalias
);
2056 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2059 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
2062 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
2065 if (!accumulator_contains(c
, args
[2]))
2066 brw_MOV(p
, brw_acc_reg(), args
[2]);
2067 brw_MAC(p
, dst
, args
[0], args
[1]);
2070 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
2073 emit_max(p
, dst
, args
[0], args
[1]);
2076 emit_min(p
, dst
, args
[0], args
[1]);
2079 brw_MOV(p
, dst
, args
[0]);
2082 brw_MUL(p
, dst
, args
[0], args
[1]);
2085 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
2088 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2091 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, brw_abs(args
[0]), BRW_MATH_PRECISION_FULL
);
2095 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
2098 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2101 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
2104 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
2107 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
2110 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
2113 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
2116 unalias1(c
, dst
, args
[0], emit_sign
);
2119 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
2122 /* The args[0] value can't be used here as it won't have
2123 * correctly encoded the full swizzle:
2125 emit_swz(c
, dst
, inst
);
2128 /* round toward zero */
2129 brw_RNDZ(p
, dst
, args
[0]);
2132 emit_xpd(p
, dst
, args
[0], args
[1]);
2135 struct brw_instruction
*if_inst
= brw_IF(p
, BRW_EXECUTE_8
);
2136 /* Note that brw_IF smashes the predicate_control field. */
2137 if_inst
->header
.predicate_control
= get_predicate(inst
);
2138 if_depth_in_loop
[loop_depth
]++;
2142 clear_current_const(c
);
2146 clear_current_const(c
);
2148 if_depth_in_loop
[loop_depth
]--;
2150 case OPCODE_BGNLOOP
:
2151 clear_current_const(c
);
2152 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
2153 if_depth_in_loop
[loop_depth
] = 0;
2156 brw_set_predicate_control(p
, get_predicate(inst
));
2157 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
2158 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2161 brw_set_predicate_control(p
, get_predicate(inst
));
2162 if (intel
->gen
>= 6) {
2163 gen6_CONT(p
, loop_inst
[loop_depth
- 1]);
2165 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
2167 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2170 case OPCODE_ENDLOOP
: {
2171 clear_current_const(c
);
2172 struct brw_instruction
*inst0
, *inst1
;
2177 if (intel
->gen
== 5)
2180 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2182 if (intel
->gen
< 6) {
2183 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2184 while (inst0
> loop_inst
[loop_depth
]) {
2186 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
2187 inst0
->bits3
.if_else
.jump_count
== 0) {
2188 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2189 } else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
2190 inst0
->bits3
.if_else
.jump_count
== 0) {
2191 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2199 brw_set_predicate_control(p
, get_predicate(inst
));
2200 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2201 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2204 brw_set_access_mode(p
, BRW_ALIGN_1
);
2205 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2206 brw_set_access_mode(p
, BRW_ALIGN_16
);
2207 brw_ADD(p
, get_addr_reg(stack_index
),
2208 get_addr_reg(stack_index
), brw_imm_d(4));
2209 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
2210 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2213 brw_ADD(p
, get_addr_reg(stack_index
),
2214 get_addr_reg(stack_index
), brw_imm_d(-4));
2215 brw_set_access_mode(p
, BRW_ALIGN_1
);
2216 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
2217 brw_set_access_mode(p
, BRW_ALIGN_16
);
2220 emit_vertex_write(c
);
2226 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
2232 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
2233 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
2234 _mesa_opcode_string(inst
->Opcode
) :
2238 /* Set the predication update on the last instruction of the native
2239 * instruction sequence.
2241 * This would be problematic if it was set on a math instruction,
2242 * but that shouldn't be the case with the current GLSL compiler.
2244 if (inst
->CondUpdate
) {
2245 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
2247 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
2248 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
2251 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
2252 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
2253 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
2254 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
2257 /* Result color clamping.
2259 * When destination register is an output register and
2260 * it's primary/secondary front/back color, we have to clamp
2261 * the result to [0,1]. This is done by enabling the
2262 * saturation bit for the last instruction.
2264 * We don't use brw_set_saturate() as it modifies
2265 * p->current->header.saturate, which affects all the subsequent
2266 * instructions. Instead, we directly modify the header
2267 * of the last (already stored) instruction.
2269 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
2270 c
->key
.clamp_vertex_color
) {
2271 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
2272 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
2273 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
2274 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
2275 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
2279 if (inst
->DstReg
.RelAddr
) {
2280 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
2281 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
2282 move_to_reladdr_dst(c
, inst
, dst
);
2288 brw_resolve_cals(p
);
2293 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
2296 printf("vs-native:\n");
2297 for (i
= 0; i
< p
->nr_insn
; i
++)
2298 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);