2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of using PROGRAM_CONSTANT values through push/pull.
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode
, int arg
)
46 int opcode_array
[] = {
66 /* These opcodes get broken down in a way that allows two
67 * args to be immediates.
69 if (opcode
== OPCODE_MAD
|| opcode
== OPCODE_LRP
) {
70 if (arg
== 1 || arg
== 2)
74 if (opcode
> ARRAY_SIZE(opcode_array
))
77 return arg
== opcode_array
[opcode
] - 1;
/* Take the next scratch GRF for the compile in progress.
 *
 * Builds an 8-wide GRF register at index c->last_tmp, then advances
 * c->last_tmp.  If the advanced value exceeds c->prog_data.total_grf,
 * total_grf is raised to match.
 *
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably tmp is what gets returned -- confirm.
 */
80 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
82 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
/* Pre-increment: the comparison sees the count after this allocation. */
84 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
85 c
->prog_data
.total_grf
= c
->last_tmp
;
90 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
92 if (tmp
.nr
== c
->last_tmp
-1)
/* Reset the scratch-register cursor: c->last_tmp is set back to
 * c->first_tmp.  Presumably this frees every temporary obtained through
 * get_tmp() since first_tmp was recorded -- confirm against callers.
 */
96 static void release_tmps( struct brw_vs_compile
*c
)
98 c
->last_tmp
= c
->first_tmp
;
/* Find the lowest output index that is written with relative addressing.
 *
 * Walks all vp->Base.NumInstructions instructions and keeps the smallest
 * DstReg.Index among those whose destination file is PROGRAM_OUTPUT and
 * whose DstReg.RelAddr is set.
 *
 * Returns that index, or VERT_RESULT_MAX when no instruction matches.
 */
102 get_first_reladdr_output(struct gl_vertex_program
*vp
)
/* VERT_RESULT_MAX doubles as the "none found" result. */
105 int first_reladdr_output
= VERT_RESULT_MAX
;
107 for (i
= 0; i
< vp
->Base
.NumInstructions
; i
++) {
108 struct prog_instruction
*inst
= vp
->Base
.Instructions
+ i
;
/* Only relatively-addressed output writes below the current minimum count. */
110 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
111 inst
->DstReg
.RelAddr
&&
112 inst
->DstReg
.Index
< first_reladdr_output
)
113 first_reladdr_output
= inst
->DstReg
.Index
;
116 return first_reladdr_output
;
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
/* When c->vp->use_const_buffer is set, writes -1 into the index field of
 * each of the three c->current_const[] slots.  Nothing is done otherwise.
 * Presumably -1 marks a slot as holding no constant, so the next lookup
 * has to fetch again -- confirm against the code that reads .index.
 */
124 clear_current_const(struct brw_vs_compile
*c
)
128 if (c
->vp
->use_const_buffer
) {
129 for (i
= 0; i
< 3; i
++) {
130 c
->current_const
[i
].index
= -1;
136 * Preallocate GRF register before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
140 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
142 struct intel_context
*intel
= &c
->func
.brw
->intel
;
143 GLuint i
, reg
= 0, mrf
, j
;
144 int attributes_in_vue
;
145 int first_reladdr_output
;
148 int vert_result_reoder
[VERT_RESULT_MAX
];
150 struct brw_vertex_program
*vp
= c
->vp
;
151 const struct gl_program_parameter_list
*params
= vp
->program
.Base
.Parameters
;
153 /* Determine whether to use a real constant buffer or use a block
154 * of GRF registers for constants. The latter is faster but only
155 * works if everything fits in the GRF.
156 * XXX this heuristic/check may need some fine tuning...
158 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
159 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
160 c
->vp
->use_const_buffer
= GL_TRUE
;
162 c
->vp
->use_const_buffer
= GL_FALSE
;
164 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
166 /* r0 -- reserved as usual
168 c
->r0
= brw_vec8_grf(reg
, 0);
171 /* User clip planes from curbe:
173 if (c
->key
.nr_userclip
) {
174 if (intel
->gen
>= 6) {
175 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
176 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
177 (i
% 2) * 4), 0, 4, 1);
179 reg
+= ALIGN(c
->key
.nr_userclip
, 2) / 2;
181 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
182 c
->userplane
[i
] = stride(brw_vec4_grf(reg
+ (6 + i
) / 2,
183 (i
% 2) * 4), 0, 4, 1);
185 reg
+= (ALIGN(6 + c
->key
.nr_userclip
, 4) / 4) * 2;
190 /* Assign some (probably all) of the vertex program constants to
191 * the push constant buffer/CURBE.
193 * There's an obvious limit to the number of push constants equal to
194 * the number of registers available, and that number is smaller
195 * than the minimum maximum number of vertex program parameters, so
196 * support for pull constants is required if we overflow.
197 * Additionally, on gen6 the number of push constants is even
200 * When there's relative addressing, we don't know what range of
201 * Mesa IR registers can be accessed. And generally, when relative
202 * addressing is used we also have too many constants to load them
203 * all as push constants. So, we'll just support relative
204 * addressing out of the pull constant buffers, and try to load as
205 * many statically-accessed constants into the push constant buffer
208 if (intel
->gen
>= 6) {
209 /* We can only load 32 regs of push constants. */
210 max_constant
= 32 * 2 - c
->key
.nr_userclip
;
212 max_constant
= BRW_MAX_GRF
- 20 - c
->vp
->program
.Base
.NumTemporaries
;
215 /* constant_map maps from ParameterValues[] index to index in the
216 * push constant buffer, or -1 if it's only in the pull constant
219 memset(c
->constant_map
, -1, c
->vp
->program
.Base
.Parameters
->NumParameters
);
221 i
< c
->vp
->program
.Base
.NumInstructions
&& constant
< max_constant
;
223 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[i
];
226 for (arg
= 0; arg
< 3 && constant
< max_constant
; arg
++) {
227 if (inst
->SrcReg
[arg
].File
!= PROGRAM_STATE_VAR
&&
228 inst
->SrcReg
[arg
].File
!= PROGRAM_CONSTANT
&&
229 inst
->SrcReg
[arg
].File
!= PROGRAM_UNIFORM
&&
230 inst
->SrcReg
[arg
].File
!= PROGRAM_ENV_PARAM
&&
231 inst
->SrcReg
[arg
].File
!= PROGRAM_LOCAL_PARAM
) {
235 if (inst
->SrcReg
[arg
].RelAddr
) {
236 c
->vp
->use_const_buffer
= GL_TRUE
;
240 if (c
->constant_map
[inst
->SrcReg
[arg
].Index
] == -1) {
241 c
->constant_map
[inst
->SrcReg
[arg
].Index
] = constant
++;
246 /* If we ran out of push constant space, then we'll also upload all
247 * constants through the pull constant buffer so that they can be
248 * accessed no matter what. For relative addressing (the common
249 * case) we need them all in place anyway.
251 if (constant
== max_constant
)
252 c
->vp
->use_const_buffer
= GL_TRUE
;
254 /* Set up the references to the pull parameters if present. This backend
255 * uses a 1:1 mapping from Mesa IR's index to location in the pull constant
256 * buffer, while the new VS backend allocates values to the pull buffer on
259 if (c
->vp
->use_const_buffer
) {
260 for (i
= 0; i
< params
->NumParameters
* 4; i
++) {
261 c
->prog_data
.pull_param
[i
] = ¶ms
->ParameterValues
[i
/ 4][i
% 4].f
;
263 c
->prog_data
.nr_pull_params
= i
;
266 for (i
= 0; i
< constant
; i
++) {
267 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride(brw_vec4_grf(reg
+ i
/ 2,
271 reg
+= (constant
+ 1) / 2;
272 c
->prog_data
.curb_read_length
= reg
- 1;
273 c
->prog_data
.nr_params
= constant
* 4;
274 /* XXX 0 causes a bug elsewhere... */
275 if (intel
->gen
< 6 && c
->prog_data
.nr_params
== 0)
276 c
->prog_data
.nr_params
= 4;
278 /* Allocate input regs:
281 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
282 if (c
->prog_data
.inputs_read
& (1 << i
)) {
284 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
288 /* If there are no inputs, we'll still be reading one attribute's worth
289 * because it's required -- see urb_read_length setting.
291 if (c
->nr_inputs
== 0)
294 /* Allocate outputs. The non-position outputs go straight into message regs.
297 c
->first_output
= reg
;
298 c
->first_overflow_output
= 0;
300 if (intel
->gen
>= 6) {
302 if (c
->key
.nr_userclip
)
304 } else if (intel
->gen
== 5)
309 first_reladdr_output
= get_first_reladdr_output(&c
->vp
->program
);
311 for (i
= 0; i
< VERT_RESULT_MAX
; i
++)
312 vert_result_reoder
[i
] = i
;
314 /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
315 if (intel
->gen
>= 6 && c
->key
.two_side_color
) {
316 if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL1
)) &&
317 (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC1
))) {
318 assert(c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL0
));
319 assert(c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC0
));
321 } else if ((c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_COL0
)) &&
322 (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_BFC0
)))
326 for (i
= 0; i
< bfc
; i
++) {
327 vert_result_reoder
[VERT_RESULT_COL0
+ i
* 2 + 0] = VERT_RESULT_COL0
+ i
;
328 vert_result_reoder
[VERT_RESULT_COL0
+ i
* 2 + 1] = VERT_RESULT_BFC0
+ i
;
331 for (i
= VERT_RESULT_COL0
+ bfc
* 2; i
< VERT_RESULT_BFC0
+ bfc
; i
++) {
332 vert_result_reoder
[i
] = i
- bfc
;
337 for (j
= 0; j
< VERT_RESULT_MAX
; j
++) {
338 i
= vert_result_reoder
[j
];
340 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
342 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
343 if (i
== VERT_RESULT_HPOS
) {
344 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
347 else if (i
== VERT_RESULT_PSIZ
) {
348 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
352 /* Two restrictions on our compute-to-MRF here. The
353 * message length for all SEND messages is restricted to
354 * [1,15], so we can't use mrf 15, as that means a length
357 * Additionally, URB writes are aligned to URB rows, so we
358 * need to put an even number of registers of URB data in
359 * each URB write so that the later write is aligned. A
360 * message length of 15 means 1 message header reg plus 14
363 * For attributes beyond the compute-to-MRF, we compute to
364 * GRFs and they will be written in the second URB_WRITE.
366 if (first_reladdr_output
> i
&& mrf
< 15) {
367 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
371 if (mrf
>= 15 && !c
->first_overflow_output
)
372 c
->first_overflow_output
= i
;
373 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
381 /* Allocate program temporaries:
383 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
384 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
388 /* Address reg(s). Don't try to use the internal address reg until
391 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
392 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
396 BRW_VERTICAL_STRIDE_8
,
398 BRW_HORIZONTAL_STRIDE_1
,
404 if (c
->vp
->use_const_buffer
) {
405 for (i
= 0; i
< 3; i
++) {
406 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
409 clear_current_const(c
);
412 for (i
= 0; i
< 128; i
++) {
413 if (c
->output_regs
[i
].used_in_src
) {
414 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
419 if (c
->needs_stack
) {
420 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
424 /* Some opcodes need an internal temporary:
427 c
->last_tmp
= reg
; /* for allocation purposes */
429 /* Each input reg holds data from two vertices. The
430 * urb_read_length is the number of registers read from *each*
431 * vertex urb, so is half the amount:
433 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
434 /* Setting this field to 0 leads to undefined behavior according to the
435 * VS_STATE docs. Our VUEs will always have at least one attribute
436 * sitting in them, even if it's padding.
438 if (c
->prog_data
.urb_read_length
== 0)
439 c
->prog_data
.urb_read_length
= 1;
441 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
442 * them to fit the biggest thing they need to.
444 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
446 /* See emit_vertex_write() for where the VUE's overhead on top of the
447 * attributes comes from.
449 if (intel
->gen
>= 7) {
451 if (c
->key
.nr_userclip
)
454 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
455 * number of 64-byte (512-bit) units.
457 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 3) / 4;
458 } else if (intel
->gen
== 6) {
460 if (c
->key
.nr_userclip
)
463 /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
464 * number of 128-byte (1024-bit) units.
466 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ header_regs
+ 7) / 8;
467 } else if (intel
->gen
== 5)
468 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
469 * number of 64-byte (512-bit) units.
471 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
473 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
475 c
->prog_data
.total_grf
= reg
;
477 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
478 printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
479 printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
480 printf("%s reg = %d\n", __FUNCTION__
, reg
);
486 * If an instruction uses a temp reg both as a src and the dest, we
487 * sometimes need to allocate an intermediate temporary.
489 static void unalias1( struct brw_vs_compile
*c
,
492 void (*func
)( struct brw_vs_compile
*,
496 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
497 struct brw_compile
*p
= &c
->func
;
498 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
500 brw_MOV(p
, dst
, tmp
);
510 * Checks if a 2-operand instruction needs an intermediate temporary.
/* Emit a two-source operation through func, avoiding a destination that
 * overlaps a source.
 *
 * If dst has the same register file and register number as arg0 or arg1,
 * func is pointed at a scratch register from get_tmp() that carries dst's
 * writemask, and the result is then copied to dst with brw_MOV.  The
 * second func call below writes dst directly.
 *
 * NOTE(review): part of the parameter list and the else line are missing
 * from this excerpt; the direct call appears to be the non-aliased path.
 */
512 static void unalias2( struct brw_vs_compile
*c
,
516 void (*func
)( struct brw_vs_compile
*,
/* Aliasing test compares register file and number only. */
521 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
522 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
523 struct brw_compile
*p
= &c
->func
;
/* Scratch register restricted to the channels dst would have written. */
524 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
525 func(c
, tmp
, arg0
, arg1
);
526 brw_MOV(p
, dst
, tmp
);
530 func(c
, dst
, arg0
, arg1
);
536 * Checks if a 3-operand instruction needs an intermediate temporary.
/* Emit a three-source operation through func, avoiding a destination that
 * overlaps a source.
 *
 * If dst has the same register file and register number as arg0, arg1 or
 * arg2, func is pointed at a scratch register from get_tmp() that carries
 * dst's writemask, and the result is then copied to dst with brw_MOV.
 * The second func call below writes dst directly.
 *
 * NOTE(review): part of the parameter list and the else line are missing
 * from this excerpt; the direct call appears to be the non-aliased path.
 */
538 static void unalias3( struct brw_vs_compile
*c
,
543 void (*func
)( struct brw_vs_compile
*,
/* Aliasing test compares register file and number only. */
549 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
550 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
551 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
552 struct brw_compile
*p
= &c
->func
;
/* Scratch register restricted to the channels dst would have written. */
553 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
554 func(c
, tmp
, arg0
, arg1
, arg2
);
555 brw_MOV(p
, dst
, tmp
);
559 func(c
, dst
, arg0
, arg1
, arg2
);
563 static void emit_sop( struct brw_vs_compile
*c
,
569 struct brw_compile
*p
= &c
->func
;
571 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
572 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
573 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
574 brw_set_predicate_control_flag_value(p
, 0xff);
577 static void emit_seq( struct brw_vs_compile
*c
,
580 struct brw_reg arg1
)
582 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
585 static void emit_sne( struct brw_vs_compile
*c
,
588 struct brw_reg arg1
)
590 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
592 static void emit_slt( struct brw_vs_compile
*c
,
595 struct brw_reg arg1
)
597 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
600 static void emit_sle( struct brw_vs_compile
*c
,
603 struct brw_reg arg1
)
605 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
608 static void emit_sgt( struct brw_vs_compile
*c
,
611 struct brw_reg arg1
)
613 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
616 static void emit_sge( struct brw_vs_compile
*c
,
619 struct brw_reg arg1
)
621 emit_sop(c
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
624 static void emit_cmp( struct brw_compile
*p
,
628 struct brw_reg arg2
)
630 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
631 brw_SEL(p
, dst
, arg1
, arg2
);
632 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
635 static void emit_sign(struct brw_vs_compile
*c
,
639 struct brw_compile
*p
= &c
->func
;
641 brw_MOV(p
, dst
, brw_imm_f(0));
643 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, brw_imm_f(0));
644 brw_MOV(p
, dst
, brw_imm_f(-1.0));
645 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
647 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, arg0
, brw_imm_f(0));
648 brw_MOV(p
, dst
, brw_imm_f(1.0));
649 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* Emit code that selects between arg0 and arg1 using a GE condition
 * (the maximum of the two).
 *
 * For intel->gen >= 6: the GE conditional modifier is set, brw_SEL is
 * emitted, then the modifier and predicate control are reset to NONE.
 * The remaining sequence compares arg0 with arg1 (GE) into the null
 * register, emits brw_SEL, and resets predicate control.
 *
 * NOTE(review): the else line is not visible in this excerpt; the
 * CMP + SEL sequence appears to be the path for earlier generations.
 */
652 static void emit_max( struct brw_compile
*p
,
655 struct brw_reg arg1
)
657 struct intel_context
*intel
= &p
->brw
->intel
;
659 if (intel
->gen
>= 6) {
/* SEL emitted while the GE conditional modifier is active. */
660 brw_set_conditionalmod(p
, BRW_CONDITIONAL_GE
);
661 brw_SEL(p
, dst
, arg0
, arg1
);
662 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
663 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* Explicit compare first, then SEL. */
665 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
666 brw_SEL(p
, dst
, arg0
, arg1
);
667 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* Emit code that selects between arg0 and arg1 using an L (less-than)
 * condition (the minimum of the two).
 *
 * For intel->gen >= 6: the L conditional modifier is set, brw_SEL is
 * emitted, then the modifier and predicate control are reset to NONE.
 * The remaining sequence compares arg0 with arg1 (L) into the null
 * register, emits brw_SEL, and resets predicate control.
 *
 * NOTE(review): the else line is not visible in this excerpt; the
 * CMP + SEL sequence appears to be the path for earlier generations.
 */
671 static void emit_min( struct brw_compile
*p
,
674 struct brw_reg arg1
)
676 struct intel_context
*intel
= &p
->brw
->intel
;
678 if (intel
->gen
>= 6) {
/* SEL emitted while the L conditional modifier is active. */
679 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
680 brw_SEL(p
, dst
, arg0
, arg1
);
681 brw_set_conditionalmod(p
, BRW_CONDITIONAL_NONE
);
682 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* Explicit compare first, then SEL. */
684 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
685 brw_SEL(p
, dst
, arg0
, arg1
);
686 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
690 static void emit_arl(struct brw_compile
*p
,
694 struct intel_context
*intel
= &p
->brw
->intel
;
696 if (intel
->gen
>= 6) {
697 struct brw_reg dst_f
= retype(dst
, BRW_REGISTER_TYPE_F
);
699 brw_RNDD(p
, dst_f
, src
);
700 brw_MOV(p
, dst
, dst_f
);
702 brw_RNDD(p
, dst
, src
);
706 static void emit_math1_gen4(struct brw_vs_compile
*c
,
712 /* There are various odd behaviours with SEND on the simulator. In
713 * addition there are documented issues with the fact that the GEN4
714 * processor doesn't do dependency control properly on SEND
715 * results. So, on balance, this kludge to get around failures
716 * with writemasked math results looks like it might be necessary
717 * whether that turns out to be a simulator bug or not:
719 struct brw_compile
*p
= &c
->func
;
720 struct brw_reg tmp
= dst
;
721 GLboolean need_tmp
= GL_FALSE
;
723 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
724 dst
.dw1
.bits
.writemask
!= 0xf)
733 BRW_MATH_SATURATE_NONE
,
736 BRW_MATH_DATA_SCALAR
,
740 brw_MOV(p
, dst
, tmp
);
746 emit_math1_gen6(struct brw_vs_compile
*c
,
752 struct brw_compile
*p
= &c
->func
;
753 struct brw_reg tmp_src
, tmp_dst
;
755 /* Something is strange on gen6 math in 16-wide mode, though the
756 * docs say it's supposed to work. Punt to using align1 mode,
757 * which doesn't do writemasking and swizzles.
759 tmp_src
= get_tmp(c
);
760 tmp_dst
= get_tmp(c
);
762 brw_MOV(p
, tmp_src
, arg0
);
764 brw_set_access_mode(p
, BRW_ALIGN_1
);
768 BRW_MATH_SATURATE_NONE
,
771 BRW_MATH_DATA_SCALAR
,
773 brw_set_access_mode(p
, BRW_ALIGN_16
);
775 brw_MOV(p
, dst
, tmp_dst
);
777 release_tmp(c
, tmp_src
);
778 release_tmp(c
, tmp_dst
);
782 emit_math1(struct brw_vs_compile
*c
,
788 struct brw_compile
*p
= &c
->func
;
789 struct intel_context
*intel
= &p
->brw
->intel
;
792 emit_math1_gen6(c
, function
, dst
, arg0
, precision
);
794 emit_math1_gen4(c
, function
, dst
, arg0
, precision
);
797 static void emit_math2_gen4( struct brw_vs_compile
*c
,
804 struct brw_compile
*p
= &c
->func
;
805 struct brw_reg tmp
= dst
;
806 GLboolean need_tmp
= GL_FALSE
;
808 if (dst
.file
!= BRW_GENERAL_REGISTER_FILE
||
809 dst
.dw1
.bits
.writemask
!= 0xf)
815 brw_MOV(p
, brw_message_reg(3), arg1
);
820 BRW_MATH_SATURATE_NONE
,
823 BRW_MATH_DATA_SCALAR
,
827 brw_MOV(p
, dst
, tmp
);
832 static void emit_math2_gen6( struct brw_vs_compile
*c
,
839 struct brw_compile
*p
= &c
->func
;
840 struct brw_reg tmp_src0
, tmp_src1
, tmp_dst
;
842 tmp_src0
= get_tmp(c
);
843 tmp_src1
= get_tmp(c
);
844 tmp_dst
= get_tmp(c
);
846 brw_MOV(p
, tmp_src0
, arg0
);
847 brw_MOV(p
, tmp_src1
, arg1
);
849 brw_set_access_mode(p
, BRW_ALIGN_1
);
855 brw_set_access_mode(p
, BRW_ALIGN_16
);
857 brw_MOV(p
, dst
, tmp_dst
);
859 release_tmp(c
, tmp_src0
);
860 release_tmp(c
, tmp_src1
);
861 release_tmp(c
, tmp_dst
);
864 static void emit_math2( struct brw_vs_compile
*c
,
871 struct brw_compile
*p
= &c
->func
;
872 struct intel_context
*intel
= &p
->brw
->intel
;
875 emit_math2_gen6(c
, function
, dst
, arg0
, arg1
, precision
);
877 emit_math2_gen4(c
, function
, dst
, arg0
, arg1
, precision
);
880 static void emit_exp_noalias( struct brw_vs_compile
*c
,
882 struct brw_reg arg0
)
884 struct brw_compile
*p
= &c
->func
;
887 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
888 struct brw_reg tmp
= get_tmp(c
);
889 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
891 /* tmp_d = floor(arg0.x) */
892 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
894 /* result[0] = 2.0 ^ tmp */
896 /* Adjust exponent for floating point:
899 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
901 /* Install exponent and sign.
902 * Excess drops off the edge:
904 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
905 tmp_d
, brw_imm_d(23));
910 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
911 /* result[1] = arg0.x - floor(arg0.x) */
912 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
915 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
916 /* As with the LOG instruction, we might be better off just
917 * doing a taylor expansion here, seeing as we have to do all
920 * If mathbox partial precision is too low, consider also:
921 * result[3] = result[0] * EXP(result[1])
924 BRW_MATH_FUNCTION_EXP
,
925 brw_writemask(dst
, WRITEMASK_Z
),
926 brw_swizzle1(arg0
, 0),
927 BRW_MATH_PRECISION_FULL
);
930 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
931 /* result[3] = 1.0; */
932 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
937 static void emit_log_noalias( struct brw_vs_compile
*c
,
939 struct brw_reg arg0
)
941 struct brw_compile
*p
= &c
->func
;
942 struct brw_reg tmp
= dst
;
943 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
944 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
945 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
946 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
950 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
953 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
956 * These almost look like they could be joined up, but not really
959 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
960 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
962 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
964 brw_writemask(tmp_ud
, WRITEMASK_X
),
965 brw_swizzle1(arg0_ud
, 0),
966 brw_imm_ud((1U<<31)-1));
969 brw_writemask(tmp_ud
, WRITEMASK_X
),
974 brw_writemask(tmp
, WRITEMASK_X
),
975 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
979 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
981 brw_writemask(tmp_ud
, WRITEMASK_Y
),
982 brw_swizzle1(arg0_ud
, 0),
983 brw_imm_ud((1<<23)-1));
986 brw_writemask(tmp_ud
, WRITEMASK_Y
),
988 brw_imm_ud(127<<23));
991 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
992 /* result[2] = result[0] + LOG2(result[1]); */
994 /* Why bother? The above is just a hint how to do this with a
995 * taylor series. Maybe we *should* use a taylor series as by
996 * the time all the above has been done it's almost certainly
997 * quicker than calling the mathbox, even with low precision.
1000 * - result[0] + mathbox.LOG2(result[1])
1001 * - mathbox.LOG2(arg0.x)
1002 * - result[0] + inline_taylor_approx(result[1])
1005 BRW_MATH_FUNCTION_LOG
,
1006 brw_writemask(tmp
, WRITEMASK_Z
),
1007 brw_swizzle1(tmp
, 1),
1008 BRW_MATH_PRECISION_FULL
);
1011 brw_writemask(tmp
, WRITEMASK_Z
),
1012 brw_swizzle1(tmp
, 2),
1013 brw_swizzle1(tmp
, 0));
1016 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
1017 /* result[3] = 1.0; */
1018 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
1022 brw_MOV(p
, dst
, tmp
);
1023 release_tmp(c
, tmp
);
1028 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
1030 static void emit_dst_noalias( struct brw_vs_compile
*c
,
1032 struct brw_reg arg0
,
1033 struct brw_reg arg1
)
1035 struct brw_compile
*p
= &c
->func
;
1037 /* There must be a better way to do this:
1039 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
1040 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
1041 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
1042 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
1043 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
1044 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
1045 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
1046 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
1050 static void emit_xpd( struct brw_compile
*p
,
1055 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
1056 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
1060 static void emit_lit_noalias( struct brw_vs_compile
*c
,
1062 struct brw_reg arg0
)
1064 struct brw_compile
*p
= &c
->func
;
1065 struct brw_reg tmp
= dst
;
1066 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1071 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
1072 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
1074 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1075 * to get all channels active inside the IF. In the clipping code
1076 * we run with NoMask, so it's not an option and we can use
1077 * BRW_EXECUTE_1 for all comparisons.
1079 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
1080 brw_IF(p
, BRW_EXECUTE_8
);
1082 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
1084 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
1085 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
1086 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1089 BRW_MATH_FUNCTION_POW
,
1090 brw_writemask(dst
, WRITEMASK_Z
),
1091 brw_swizzle1(tmp
, 2),
1092 brw_swizzle1(arg0
, 3),
1093 BRW_MATH_PRECISION_PARTIAL
);
1097 release_tmp(c
, tmp
);
1100 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
1102 struct brw_reg arg0
,
1103 struct brw_reg arg1
,
1104 struct brw_reg arg2
)
1106 struct brw_compile
*p
= &c
->func
;
1108 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
1109 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
1110 brw_MAC(p
, dst
, arg0
, arg1
);
1113 static struct brw_reg
1114 get_constant(struct brw_vs_compile
*c
,
1115 const struct prog_instruction
*inst
,
1118 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1119 struct brw_compile
*p
= &c
->func
;
1120 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1122 assert(argIndex
< 3);
1124 if (c
->current_const
[argIndex
].index
!= src
->Index
) {
1125 /* Keep track of the last constant loaded in this slot, for reuse. */
1126 c
->current_const
[argIndex
].index
= src
->Index
;
1129 printf(" fetch const[%d] for arg %d into reg %d\n",
1130 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1132 /* need to fetch the constant now */
1134 const_reg
, /* writeback dest */
1135 16 * src
->Index
, /* byte offset */
1136 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
1140 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1141 const_reg
= stride(const_reg
, 0, 4, 1);
1142 const_reg
.subnr
= 0;
1147 static struct brw_reg
1148 get_reladdr_constant(struct brw_vs_compile
*c
,
1149 const struct prog_instruction
*inst
,
1152 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1153 struct brw_compile
*p
= &c
->func
;
1154 struct brw_context
*brw
= p
->brw
;
1155 struct intel_context
*intel
= &brw
->intel
;
1156 struct brw_reg const_reg
= c
->current_const
[argIndex
].reg
;
1157 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1160 assert(argIndex
< 3);
1162 /* Can't reuse a reladdr constant load. */
1163 c
->current_const
[argIndex
].index
= -1;
1166 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1167 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
1170 if (intel
->gen
>= 6) {
1171 offset
= src
->Index
;
1173 struct brw_reg byte_addr_reg
= retype(get_tmp(c
), BRW_REGISTER_TYPE_D
);
1174 brw_MUL(p
, byte_addr_reg
, addr_reg
, brw_imm_d(16));
1175 addr_reg
= byte_addr_reg
;
1176 offset
= 16 * src
->Index
;
1179 /* fetch the first vec4 */
1180 brw_dp_READ_4_vs_relative(p
,
1184 SURF_INDEX_VERT_CONST_BUFFER
);
1191 /* TODO: relative addressing!
1193 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
1194 gl_register_file file
,
1198 case PROGRAM_TEMPORARY
:
1200 case PROGRAM_OUTPUT
:
1201 assert(c
->regs
[file
][index
].nr
!= 0);
1202 return c
->regs
[file
][index
];
1203 case PROGRAM_STATE_VAR
:
1204 case PROGRAM_CONSTANT
:
1205 case PROGRAM_UNIFORM
:
1206 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
1207 return c
->regs
[PROGRAM_STATE_VAR
][index
];
1208 case PROGRAM_ADDRESS
:
1210 return c
->regs
[file
][index
];
1212 case PROGRAM_UNDEFINED
: /* undef values */
1213 return brw_null_reg();
1215 case PROGRAM_LOCAL_PARAM
:
1216 case PROGRAM_ENV_PARAM
:
1217 case PROGRAM_WRITE_ONLY
:
1220 return brw_null_reg();
1226 * Indirect addressing: get reg[[arg] + offset].
1228 static struct brw_reg
deref( struct brw_vs_compile
*c
,
1233 struct brw_compile
*p
= &c
->func
;
1234 struct brw_reg tmp
= get_tmp(c
);
1235 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1236 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1237 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* reg_size
;
1238 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1239 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1241 /* Set the vertical stride on the register access so that the first
1242 * 4 components come from a0.0 and the second 4 from a0.1.
1244 indirect
.vstride
= BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL
;
1247 brw_push_insn_state(p
);
1248 brw_set_access_mode(p
, BRW_ALIGN_1
);
1250 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1251 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1253 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1254 brw_ADD(p
, brw_address_reg(1), acc
, brw_imm_uw(byte_offset
));
1256 brw_MOV(p
, tmp
, indirect
);
1258 brw_pop_insn_state(p
);
1261 /* NOTE: tmp not released */
1266 move_to_reladdr_dst(struct brw_vs_compile
*c
,
1267 const struct prog_instruction
*inst
,
1270 struct brw_compile
*p
= &c
->func
;
1272 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
1273 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_D
);
1274 struct brw_reg base
= c
->regs
[inst
->DstReg
.File
][inst
->DstReg
.Index
];
1275 GLuint byte_offset
= base
.nr
* 32 + base
.subnr
;
1276 struct brw_reg indirect
= brw_vec4_indirect(0,0);
1277 struct brw_reg acc
= retype(vec1(get_tmp(c
)), BRW_REGISTER_TYPE_UW
);
1279 /* Because destination register indirect addressing can only use
1280 * one index, we'll write each vertex's vec4 value separately.
1282 val
.width
= BRW_WIDTH_4
;
1283 val
.vstride
= BRW_VERTICAL_STRIDE_4
;
1285 brw_push_insn_state(p
);
1286 brw_set_access_mode(p
, BRW_ALIGN_1
);
1288 brw_MUL(p
, acc
, vp_address
, brw_imm_uw(reg_size
));
1289 brw_ADD(p
, brw_address_reg(0), acc
, brw_imm_uw(byte_offset
));
1290 brw_MOV(p
, indirect
, val
);
1292 brw_MUL(p
, acc
, suboffset(vp_address
, 4), brw_imm_uw(reg_size
));
1293 brw_ADD(p
, brw_address_reg(0), acc
,
1294 brw_imm_uw(byte_offset
+ reg_size
/ 2));
1295 brw_MOV(p
, indirect
, suboffset(val
, 4));
1297 brw_pop_insn_state(p
);
1301 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1302 * TODO: relative addressing!
1304 static struct brw_reg
1305 get_src_reg( struct brw_vs_compile
*c
,
1306 const struct prog_instruction
*inst
,
1309 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
1310 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
1311 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
1313 if (brw_vs_arg_can_be_immediate(inst
->Opcode
, argIndex
)) {
1314 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1316 if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ZERO
,
1320 return brw_imm_f(0.0f
);
1321 } else if (src
->Swizzle
== MAKE_SWIZZLE4(SWIZZLE_ONE
,
1326 return brw_imm_f(-1.0F
);
1328 return brw_imm_f(1.0F
);
1329 } else if (src
->File
== PROGRAM_CONSTANT
) {
1330 const struct gl_program_parameter_list
*params
;
1334 switch (src
->Swizzle
) {
1349 if (component
>= 0) {
1350 params
= c
->vp
->program
.Base
.Parameters
;
1351 f
= params
->ParameterValues
[src
->Index
][component
].f
;
1357 return brw_imm_f(f
);
1363 case PROGRAM_TEMPORARY
:
1365 case PROGRAM_OUTPUT
:
1367 return deref(c
, c
->regs
[file
][0], index
, 32);
1370 assert(c
->regs
[file
][index
].nr
!= 0);
1371 return c
->regs
[file
][index
];
1374 case PROGRAM_STATE_VAR
:
1375 case PROGRAM_CONSTANT
:
1376 case PROGRAM_UNIFORM
:
1377 case PROGRAM_ENV_PARAM
:
1378 case PROGRAM_LOCAL_PARAM
:
1379 if (!relAddr
&& c
->constant_map
[index
] != -1) {
1380 /* Take from the push constant buffer if possible. */
1381 assert(c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]].nr
!= 0);
1382 return c
->regs
[PROGRAM_STATE_VAR
][c
->constant_map
[index
]];
1384 /* Must be in the pull constant buffer then .*/
1385 assert(c
->vp
->use_const_buffer
);
1387 return get_reladdr_constant(c
, inst
, argIndex
);
1389 return get_constant(c
, inst
, argIndex
);
1391 case PROGRAM_ADDRESS
:
1393 return c
->regs
[file
][index
];
1395 case PROGRAM_UNDEFINED
:
1396 /* this is a normal case since we loop over all three src args */
1397 return brw_null_reg();
1399 case PROGRAM_WRITE_ONLY
:
1402 return brw_null_reg();
1407 * Return the brw reg for the given instruction's src argument.
1408 * Will return mangled results for SWZ op. The emit_swz() function
1409 * ignores this result and recalculates taking extended swizzles into
1412 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1413 const struct prog_instruction
*inst
,
1416 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
1419 if (src
->File
== PROGRAM_UNDEFINED
)
1420 return brw_null_reg();
1422 reg
= get_src_reg(c
, inst
, argIndex
);
1424 /* Convert 3-bit swizzle to 2-bit.
1426 if (reg
.file
!= BRW_IMMEDIATE_VALUE
) {
1427 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1428 GET_SWZ(src
->Swizzle
, 1),
1429 GET_SWZ(src
->Swizzle
, 2),
1430 GET_SWZ(src
->Swizzle
, 3));
1432 /* Note this is ok for non-swizzle ARB_vp instructions */
1433 reg
.negate
= src
->Negate
? 1 : 0;
1441 * Get brw register for the given program dest register.
1443 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1444 struct prog_dst_register dst
)
1449 case PROGRAM_TEMPORARY
:
1450 case PROGRAM_OUTPUT
:
1451 /* register-indirect addressing is only 1x1, not VxH, for
1452 * destination regs. So, for RelAddr we'll return a temporary
1453 * for the dest and do a move of the result to the RelAddr
1454 * register after the instruction emit.
1459 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
1460 reg
= c
->regs
[dst
.File
][dst
.Index
];
1463 case PROGRAM_ADDRESS
:
1464 assert(dst
.Index
== 0);
1465 reg
= c
->regs
[dst
.File
][dst
.Index
];
1467 case PROGRAM_UNDEFINED
:
1468 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1469 reg
= brw_null_reg();
1473 reg
= brw_null_reg();
1476 assert(reg
.type
!= BRW_IMMEDIATE_VALUE
);
1477 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1483 static void emit_swz( struct brw_vs_compile
*c
,
1485 const struct prog_instruction
*inst
)
1487 const GLuint argIndex
= 0;
1488 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1489 struct brw_compile
*p
= &c
->func
;
1490 GLuint zeros_mask
= 0;
1491 GLuint ones_mask
= 0;
1492 GLuint src_mask
= 0;
1494 GLboolean need_tmp
= (src
.Negate
&&
1495 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1496 struct brw_reg tmp
= dst
;
1502 for (i
= 0; i
< 4; i
++) {
1503 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1504 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1523 /* Do src first, in case dst aliases src:
1526 struct brw_reg arg0
;
1528 arg0
= get_src_reg(c
, inst
, argIndex
);
1530 arg0
= brw_swizzle(arg0
,
1531 src_swz
[0], src_swz
[1],
1532 src_swz
[2], src_swz
[3]);
1534 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1538 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1541 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1544 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1547 brw_MOV(p
, dst
, tmp
);
1548 release_tmp(c
, tmp
);
1553 align_interleaved_urb_mlen(struct brw_context
*brw
, int mlen
)
1555 struct intel_context
*intel
= &brw
->intel
;
1557 if (intel
->gen
>= 6) {
1558 /* URB data written (does not include the message header reg) must
1559 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1560 * section 5.4.3.2.2: URB_INTERLEAVED.
1562 * URB entries are allocated on a multiple of 1024 bits, so an
1563 * extra 128 bits written here to make the end align to 256 is
1566 if ((mlen
% 2) != 1)
1574 * Post-vertex-program processing. Send the results to the URB.
1576 static void emit_vertex_write( struct brw_vs_compile
*c
)
1578 struct brw_compile
*p
= &c
->func
;
1579 struct brw_context
*brw
= p
->brw
;
1580 struct intel_context
*intel
= &brw
->intel
;
1581 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1584 GLuint len_vertex_header
= 2;
1588 if (c
->key
.copy_edgeflag
) {
1590 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1591 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1594 if (intel
->gen
< 6) {
1595 /* Build ndc coords */
1597 /* ndc = 1.0 / pos.w */
1598 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1599 /* ndc.xyz = pos * ndc */
1600 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1603 /* Update the header for point size, user clipping flags, and -ve rhw
1606 if (intel
->gen
>= 6) {
1607 struct brw_reg m1
= brw_message_reg(1);
1609 /* On gen6, m1 has each value in a separate dword, so we never
1610 * need to mess with a temporary for computing the m1 value.
1612 brw_MOV(p
, retype(m1
, BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1613 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1614 brw_MOV(p
, brw_writemask(m1
, WRITEMASK_W
),
1615 brw_swizzle1(c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
], 0));
1618 /* Set the user clip distances in dword 8-15. (m3-4)*/
1619 if (c
->key
.nr_userclip
) {
1620 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1623 m
= brw_message_reg(3);
1625 m
= brw_message_reg(4);
1627 brw_DP4(p
, brw_writemask(m
, (1 << (i
& 3))),pos
, c
->userplane
[i
]);
1630 } else if ((c
->prog_data
.outputs_written
&
1631 BITFIELD64_BIT(VERT_RESULT_PSIZ
)) ||
1632 c
->key
.nr_userclip
|| brw
->has_negative_rhw_bug
) {
1633 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1636 brw_MOV(p
, header1
, brw_imm_ud(0));
1638 brw_set_access_mode(p
, BRW_ALIGN_16
);
1640 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(VERT_RESULT_PSIZ
)) {
1641 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1642 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
),
1643 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1644 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
),
1645 header1
, brw_imm_ud(0x7ff<<8));
1648 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1649 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1650 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1651 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1652 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1655 /* i965 clipping workaround:
1656 * 1) Test for -ve rhw
1658 * set ndc = (0,0,0,0)
1661 * Later, clipping will detect ucp[6] and ensure the primitive is
1662 * clipped against all fixed planes.
1664 if (brw
->has_negative_rhw_bug
) {
1666 vec8(brw_null_reg()),
1668 brw_swizzle1(ndc
, 3),
1671 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1672 brw_MOV(p
, ndc
, brw_imm_f(0));
1673 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1676 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1677 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1678 brw_set_access_mode(p
, BRW_ALIGN_16
);
1680 release_tmp(c
, header1
);
1683 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1686 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1687 * of zeros followed by two sets of NDC coordinates:
1689 brw_set_access_mode(p
, BRW_ALIGN_1
);
1690 brw_set_acc_write_control(p
, 0);
1692 /* The VUE layout is documented in Volume 2a. */
1693 if (intel
->gen
>= 6) {
1694 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1695 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1696 * dword 4-7 (m2) is the 4D space position
1697 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1699 * m3 or 5 is the first vertex element data we fill, which is
1700 * the vertex position.
1702 brw_MOV(p
, brw_message_reg(2), pos
);
1703 len_vertex_header
= 1;
1704 if (c
->key
.nr_userclip
> 0)
1705 len_vertex_header
+= 2;
1706 } else if (intel
->gen
== 5) {
1707 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1708 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1709 * dword 4-7 (m2) is the ndc position (set above)
1710 * dword 8-11 (m3) of the vertex header is the 4D space position
1711 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1712 * m6 is a pad so that the vertex element data is aligned
1713 * m7 is the first vertex data we fill, which is the vertex position.
1715 brw_MOV(p
, brw_message_reg(2), ndc
);
1716 brw_MOV(p
, brw_message_reg(3), pos
);
1717 brw_MOV(p
, brw_message_reg(7), pos
);
1718 len_vertex_header
= 6;
1720 /* There are 8 dwords in VUE header pre-Ironlake:
1721 * dword 0-3 (m1) is indices, point width, clip flags.
1722 * dword 4-7 (m2) is ndc position (set above)
1724 * dword 8-11 (m3) is the first vertex data, which we always have be the
1727 brw_MOV(p
, brw_message_reg(2), ndc
);
1728 brw_MOV(p
, brw_message_reg(3), pos
);
1729 len_vertex_header
= 2;
1732 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1733 next_mrf
= 2 + len_vertex_header
;
1734 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
1735 if (c
->first_overflow_output
> 0 && i
>= c
->first_overflow_output
)
1737 if (!(c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)))
1739 if (i
== VERT_RESULT_PSIZ
)
1742 if (i
>= VERT_RESULT_TEX0
&&
1743 c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_GENERAL_REGISTER_FILE
) {
1744 brw_MOV(p
, brw_message_reg(next_mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1746 } else if (c
->regs
[PROGRAM_OUTPUT
][i
].file
== BRW_MESSAGE_REGISTER_FILE
) {
1747 next_mrf
= c
->regs
[PROGRAM_OUTPUT
][i
].nr
+ 1;
1751 eot
= (c
->first_overflow_output
== 0);
1753 /* Message header, plus VUE header, plus the (first set of) outputs. */
1754 msg_len
= 1 + len_vertex_header
+ c
->nr_outputs
;
1755 msg_len
= align_interleaved_urb_mlen(brw
, msg_len
);
1756 /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
1757 msg_len
= MIN2(msg_len
, (BRW_MAX_MRF
- 1)),
1760 brw_null_reg(), /* dest */
1761 0, /* starting mrf reg nr */
1766 0, /* response len */
1768 eot
, /* writes complete */
1769 0, /* urb destination offset */
1770 BRW_URB_SWIZZLE_INTERLEAVE
);
1772 if (c
->first_overflow_output
> 0) {
1773 /* Not all of the vertex outputs/results fit into the MRF.
1774 * Move the overflowed attributes from the GRF to the MRF and
1775 * issue another brw_urb_WRITE().
1778 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1779 if (c
->prog_data
.outputs_written
& BITFIELD64_BIT(i
)) {
1780 /* move from GRF to MRF */
1781 brw_MOV(p
, brw_message_reg(mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1787 brw_null_reg(), /* dest */
1788 0, /* starting mrf reg nr */
1792 align_interleaved_urb_mlen(brw
, mrf
),
1793 0, /* response len */
1795 1, /* writes complete */
1796 14 / 2, /* urb destination offset */
1797 BRW_URB_SWIZZLE_INTERLEAVE
);
1802 accumulator_contains(struct brw_vs_compile
*c
, struct brw_reg val
)
1804 struct brw_compile
*p
= &c
->func
;
1805 struct brw_instruction
*prev_insn
= &p
->store
[p
->nr_insn
- 1];
1807 if (p
->nr_insn
== 0)
1810 if (val
.address_mode
!= BRW_ADDRESS_DIRECT
)
1813 if (val
.negate
|| val
.abs
)
1816 switch (prev_insn
->header
.opcode
) {
1817 case BRW_OPCODE_MOV
:
1818 case BRW_OPCODE_MAC
:
1819 case BRW_OPCODE_MUL
:
1820 if (prev_insn
->header
.access_mode
== BRW_ALIGN_16
&&
1821 prev_insn
->header
.execution_size
== val
.width
&&
1822 prev_insn
->bits1
.da1
.dest_reg_file
== val
.file
&&
1823 prev_insn
->bits1
.da1
.dest_reg_type
== val
.type
&&
1824 prev_insn
->bits1
.da1
.dest_address_mode
== val
.address_mode
&&
1825 prev_insn
->bits1
.da1
.dest_reg_nr
== val
.nr
&&
1826 prev_insn
->bits1
.da16
.dest_subreg_nr
== val
.subnr
/ 16 &&
1827 prev_insn
->bits1
.da16
.dest_writemask
== 0xf)
1837 get_predicate(const struct prog_instruction
*inst
)
1839 if (inst
->DstReg
.CondMask
== COND_TR
)
1840 return BRW_PREDICATE_NONE
;
1842 /* All of GLSL only produces predicates for COND_NE and one channel per
1843 * vector. Fail badly if someone starts doing something else, as it might
1844 * mean infinite looping or something.
1846 * We'd like to support all the condition codes, but our hardware doesn't
1847 * quite match the Mesa IR, which is modeled after the NV extensions. For
1848 * those, the instruction may update the condition codes or not, then any
1849 * later instruction may use one of those condition codes. For gen4, the
1850 * instruction may update the flags register based on one of the condition
1851 * codes output by the instruction, and then further instructions may
1852 * predicate on that. We can probably support this, but it won't
1853 * necessarily be easy.
1855 assert(inst
->DstReg
.CondMask
== COND_NE
);
1857 switch (inst
->DstReg
.CondSwizzle
) {
1859 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1861 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1863 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1865 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1867 _mesa_problem(NULL
, "Unexpected predicate: 0x%08x\n",
1868 inst
->DstReg
.CondMask
);
1869 return BRW_PREDICATE_NORMAL
;
1874 brw_vs_rescale_gl_fixed(struct brw_vs_compile
*c
)
1876 struct brw_compile
*p
= &c
->func
;
1879 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
1880 if (!(c
->prog_data
.inputs_read
& (1 << i
)))
1883 if (c
->key
.gl_fixed_input_size
[i
] != 0) {
1884 struct brw_reg reg
= c
->regs
[PROGRAM_INPUT
][i
];
1887 brw_writemask(reg
, (1 << c
->key
.gl_fixed_input_size
[i
]) - 1),
1888 reg
, brw_imm_f(1.0 / 65536.0));
1893 /* Emit the vertex program instructions here.
1895 void brw_old_vs_emit(struct brw_vs_compile
*c
)
1897 #define MAX_IF_DEPTH 32
1898 #define MAX_LOOP_DEPTH 32
1899 struct brw_compile
*p
= &c
->func
;
1900 struct brw_context
*brw
= p
->brw
;
1901 struct intel_context
*intel
= &brw
->intel
;
1902 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1903 GLuint insn
, loop_depth
= 0;
1904 struct brw_instruction
*loop_inst
[MAX_LOOP_DEPTH
] = { 0 };
1905 int if_depth_in_loop
[MAX_LOOP_DEPTH
];
1906 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1910 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
1911 printf("vs-mesa:\n");
1912 _mesa_fprint_program_opt(stdout
, &c
->vp
->program
.Base
, PROG_PRINT_DEBUG
,
1917 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1918 brw_set_access_mode(p
, BRW_ALIGN_16
);
1919 if_depth_in_loop
[loop_depth
] = 0;
1921 brw_set_acc_write_control(p
, 1);
1923 for (insn
= 0; insn
< nr_insns
; insn
++) {
1925 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1927 /* Message registers can't be read, so copy the output into GRF
1928 * register if they are used in source registers
1930 for (i
= 0; i
< 3; i
++) {
1931 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1932 GLuint index
= src
->Index
;
1933 GLuint file
= src
->File
;
1934 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1935 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1938 switch (inst
->Opcode
) {
1941 c
->needs_stack
= GL_TRUE
;
1948 /* Static register allocation
1950 brw_vs_alloc_regs(c
);
1952 brw_vs_rescale_gl_fixed(c
);
1955 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1957 for (insn
= 0; insn
< nr_insns
; insn
++) {
1959 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1960 struct brw_reg args
[3], dst
;
1964 printf("%d: ", insn
);
1965 _mesa_print_instruction(inst
);
1968 /* Get argument regs. SWZ is special and does this itself.
1970 if (inst
->Opcode
!= OPCODE_SWZ
)
1971 for (i
= 0; i
< 3; i
++) {
1972 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1975 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
) {
1976 /* Can't just make get_arg "do the right thing" here because
1977 * other callers of get_arg and get_src_reg don't expect any
1978 * special behavior for the c->output_regs[index].used_in_src
1981 args
[i
] = c
->output_regs
[index
].reg
;
1982 args
[i
].dw1
.bits
.swizzle
=
1983 BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
1984 GET_SWZ(src
->Swizzle
, 1),
1985 GET_SWZ(src
->Swizzle
, 2),
1986 GET_SWZ(src
->Swizzle
, 3));
1988 /* Note this is ok for non-swizzle ARB_vp instructions */
1989 args
[i
].negate
= src
->Negate
? 1 : 0;
1991 args
[i
] = get_arg(c
, inst
, i
);
1994 /* Get dest regs. Note that it is possible for a reg to be both
1995 * dst and arg, given the static allocation of registers. So
1996 * care needs to be taken emitting multi-operation instructions.
1998 index
= inst
->DstReg
.Index
;
1999 file
= inst
->DstReg
.File
;
2000 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
2001 /* Can't just make get_dst "do the right thing" here because other
2002 * callers of get_dst don't expect any special behavior for the
2003 * c->output_regs[index].used_in_src case.
2005 dst
= brw_writemask(c
->output_regs
[index
].reg
, inst
->DstReg
.WriteMask
);
2007 dst
= get_dst(c
, inst
->DstReg
);
2009 if (inst
->SaturateMode
!= SATURATE_OFF
) {
2010 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
2011 inst
->SaturateMode
);
2014 switch (inst
->Opcode
) {
2016 args
[0].negate
= false;
2017 brw_MOV(p
, dst
, brw_abs(args
[0]));
2020 brw_ADD(p
, dst
, args
[0], args
[1]);
2023 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2026 brw_DP2(p
, dst
, args
[0], args
[1]);
2029 brw_DP3(p
, dst
, args
[0], args
[1]);
2032 brw_DP4(p
, dst
, args
[0], args
[1]);
2035 brw_DPH(p
, dst
, args
[0], args
[1]);
2038 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
2041 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
2044 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2047 emit_arl(p
, dst
, args
[0]);
2050 brw_RNDD(p
, dst
, args
[0]);
2053 brw_FRC(p
, dst
, args
[0]);
2056 unalias1(c
, dst
, args
[0], emit_log_noalias
);
2059 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2062 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
2065 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
2068 if (!accumulator_contains(c
, args
[2]))
2069 brw_MOV(p
, brw_acc_reg(), args
[2]);
2070 brw_MAC(p
, dst
, args
[0], args
[1]);
2073 emit_cmp(p
, dst
, args
[0], args
[1], args
[2]);
2076 emit_max(p
, dst
, args
[0], args
[1]);
2079 emit_min(p
, dst
, args
[0], args
[1]);
2082 brw_MOV(p
, dst
, args
[0]);
2085 brw_MUL(p
, dst
, args
[0], args
[1]);
2088 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
2091 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2094 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, brw_abs(args
[0]), BRW_MATH_PRECISION_FULL
);
2098 unalias2(c
, dst
, args
[0], args
[1], emit_seq
);
2101 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
2104 unalias2(c
, dst
, args
[0], args
[1], emit_sne
);
2107 unalias2(c
, dst
, args
[0], args
[1], emit_sge
);
2110 unalias2(c
, dst
, args
[0], args
[1], emit_sgt
);
2113 unalias2(c
, dst
, args
[0], args
[1], emit_slt
);
2116 unalias2(c
, dst
, args
[0], args
[1], emit_sle
);
2119 unalias1(c
, dst
, args
[0], emit_sign
);
2122 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
2125 /* The args[0] value can't be used here as it won't have
2126 * correctly encoded the full swizzle:
2128 emit_swz(c
, dst
, inst
);
2131 /* round toward zero */
2132 brw_RNDZ(p
, dst
, args
[0]);
2135 emit_xpd(p
, dst
, args
[0], args
[1]);
2138 struct brw_instruction
*if_inst
= brw_IF(p
, BRW_EXECUTE_8
);
2139 /* Note that brw_IF smashes the predicate_control field. */
2140 if_inst
->header
.predicate_control
= get_predicate(inst
);
2141 if_depth_in_loop
[loop_depth
]++;
2145 clear_current_const(c
);
2149 clear_current_const(c
);
2151 if_depth_in_loop
[loop_depth
]--;
2153 case OPCODE_BGNLOOP
:
2154 clear_current_const(c
);
2155 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
2156 if_depth_in_loop
[loop_depth
] = 0;
2159 brw_set_predicate_control(p
, get_predicate(inst
));
2160 brw_BREAK(p
, if_depth_in_loop
[loop_depth
]);
2161 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2164 brw_set_predicate_control(p
, get_predicate(inst
));
2165 if (intel
->gen
>= 6) {
2166 gen6_CONT(p
, loop_inst
[loop_depth
- 1]);
2168 brw_CONT(p
, if_depth_in_loop
[loop_depth
]);
2170 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2173 case OPCODE_ENDLOOP
: {
2174 clear_current_const(c
);
2175 struct brw_instruction
*inst0
, *inst1
;
2180 if (intel
->gen
== 5)
2183 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
2185 if (intel
->gen
< 6) {
2186 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2187 while (inst0
> loop_inst
[loop_depth
]) {
2189 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
&&
2190 inst0
->bits3
.if_else
.jump_count
== 0) {
2191 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
2192 } else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
&&
2193 inst0
->bits3
.if_else
.jump_count
== 0) {
2194 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
2202 brw_set_predicate_control(p
, get_predicate(inst
));
2203 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2204 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
2207 brw_set_access_mode(p
, BRW_ALIGN_1
);
2208 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
2209 brw_set_access_mode(p
, BRW_ALIGN_16
);
2210 brw_ADD(p
, get_addr_reg(stack_index
),
2211 get_addr_reg(stack_index
), brw_imm_d(4));
2212 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
2213 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2216 brw_ADD(p
, get_addr_reg(stack_index
),
2217 get_addr_reg(stack_index
), brw_imm_d(-4));
2218 brw_set_access_mode(p
, BRW_ALIGN_1
);
2219 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
2220 brw_set_access_mode(p
, BRW_ALIGN_16
);
2223 emit_vertex_write(c
);
2229 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
2235 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
2236 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
2237 _mesa_opcode_string(inst
->Opcode
) :
2241 /* Set the predication update on the last instruction of the native
2242 * instruction sequence.
2244 * This would be problematic if it was set on a math instruction,
2245 * but that shouldn't be the case with the current GLSL compiler.
2247 if (inst
->CondUpdate
) {
2248 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
2250 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
2251 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
2254 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
2255 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
2256 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
2257 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
2260 /* Result color clamping.
2262 * When destination register is an output register and
2263 * it's primary/secondary front/back color, we have to clamp
2264 * the result to [0,1]. This is done by enabling the
2265 * saturation bit for the last instruction.
2267 * We don't use brw_set_saturate() as it modifies
2268 * p->current->header.saturate, which affects all the subsequent
2269 * instructions. Instead, we directly modify the header
2270 * of the last (already stored) instruction.
2272 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
&&
2273 c
->key
.clamp_vertex_color
) {
2274 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
2275 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
2276 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
2277 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
2278 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
2282 if (inst
->DstReg
.RelAddr
) {
2283 assert(inst
->DstReg
.File
== PROGRAM_TEMPORARY
||
2284 inst
->DstReg
.File
== PROGRAM_OUTPUT
);
2285 move_to_reladdr_dst(c
, inst
, dst
);
2291 brw_resolve_cals(p
);
2296 if (unlikely(INTEL_DEBUG
& DEBUG_VS
)) {
2299 printf("vs-native:\n");
2300 for (i
= 0; i
< p
->nr_insn
; i
++)
2301 brw_disasm(stdout
, &p
->store
[i
], intel
->gen
);