2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
32 #include "pipe/p_shader_tokens.h"
34 #include "util/u_memory.h"
35 #include "util/u_math.h"
37 #include "tgsi/tgsi_parse.h"
38 #include "tgsi/tgsi_dump.h"
39 #include "tgsi/tgsi_info.h"
41 #include "brw_context.h"
43 #include "brw_debug.h"
44 #include "brw_disasm.h"
46 /* Choose one of the 4 vec4's which can be packed into each 16-wide reg.
48 static INLINE
struct brw_reg
brw_vec4_grf_repeat( GLuint reg
, GLuint slot
)
50 int nr
= reg
+ slot
/2;
51 int subnr
= (slot
%2) * 4;
53 return stride(brw_vec4_grf(nr
, subnr
), 0, 4, 1);
57 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
59 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
61 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
62 c
->prog_data
.total_grf
= c
->last_tmp
;
67 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
69 if (tmp
.nr
== c
->last_tmp
-1)
73 static void release_tmps( struct brw_vs_compile
*c
)
75 c
->last_tmp
= c
->first_tmp
;
79 static boolean
is_position_output( struct brw_vs_compile
*c
,
82 struct brw_vertex_shader
*vs
= c
->vp
;
84 if (vs_output
== c
->prog_data
.output_edgeflag
) {
88 unsigned semantic
= vs
->info
.output_semantic_name
[vs_output
];
89 unsigned index
= vs
->info
.output_semantic_index
[vs_output
];
91 return (semantic
== TGSI_SEMANTIC_POSITION
&&
97 static boolean
find_output_slot( struct brw_vs_compile
*c
,
99 unsigned *fs_input_slot
)
101 struct brw_vertex_shader
*vs
= c
->vp
;
103 if (vs_output
== c
->prog_data
.output_edgeflag
) {
104 *fs_input_slot
= c
->key
.fs_signature
.nr_inputs
;
108 unsigned semantic
= vs
->info
.output_semantic_name
[vs_output
];
109 unsigned index
= vs
->info
.output_semantic_index
[vs_output
];
112 for (i
= 0; i
< c
->key
.fs_signature
.nr_inputs
; i
++) {
113 if (c
->key
.fs_signature
.input
[i
].semantic
== semantic
&&
114 c
->key
.fs_signature
.input
[i
].semantic_index
== index
) {
126 * Preallocate GRF register before code emit.
127 * Do things as simply as possible. Allocate and populate all regs
130 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
132 GLuint i
, reg
= 0, subreg
= 0, mrf
;
133 int attributes_in_vue
;
135 /* Determine whether to use a real constant buffer or use a block
136 * of GRF registers for constants. The later is faster but only
137 * works if everything fits in the GRF.
138 * XXX this heuristic/check may need some fine tuning...
140 if (c
->vp
->info
.file_max
[TGSI_FILE_CONSTANT
] + 1 +
141 c
->vp
->info
.file_max
[TGSI_FILE_IMMEDIATE
] + 1 +
142 c
->vp
->info
.file_max
[TGSI_FILE_TEMPORARY
] + 1 + 21 > BRW_MAX_GRF
)
143 c
->vp
->use_const_buffer
= GL_TRUE
;
145 /* XXX: immediates can go elsewhere if necessary:
147 assert(c
->vp
->info
.file_max
[TGSI_FILE_IMMEDIATE
] + 1 +
148 c
->vp
->info
.file_max
[TGSI_FILE_TEMPORARY
] + 1 + 21 <= BRW_MAX_GRF
);
150 c
->vp
->use_const_buffer
= GL_FALSE
;
153 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
155 /* r0 -- reserved as usual
157 c
->r0
= brw_vec8_grf(reg
, 0);
160 /* User clip planes from curbe:
162 if (c
->key
.nr_userclip
) {
163 /* Skip over fixed planes: Or never read them into vs unit?
167 for (i
= 0; i
< c
->key
.nr_userclip
; i
++, subreg
++) {
169 stride( brw_vec4_grf(reg
+subreg
/2, (subreg
%2) * 4), 0, 4, 1);
172 /* Deal with curbe alignment:
174 subreg
= align(subreg
, 2);
175 /*reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;*/
179 /* Immediates: always in the curbe.
181 * XXX: Can try to encode some immediates as brw immediates
182 * XXX: Make sure ureg sets minimal immediate size and respect it
185 for (i
= 0; i
< c
->vp
->info
.immediate_count
; i
++, subreg
++) {
186 c
->regs
[TGSI_FILE_IMMEDIATE
][i
] =
187 stride( brw_vec4_grf(reg
+subreg
/2, (subreg
%2) * 4), 0, 4, 1);
189 c
->prog_data
.nr_params
= c
->vp
->info
.immediate_count
* 4;
192 /* Vertex constant buffer.
194 * Constants from the buffer can be either cached in the curbe or
195 * loaded as needed from the actual constant buffer.
197 if (!c
->vp
->use_const_buffer
) {
198 GLuint nr_params
= c
->vp
->info
.file_max
[TGSI_FILE_CONSTANT
] + 1;
200 for (i
= 0; i
< nr_params
; i
++, subreg
++) {
201 c
->regs
[TGSI_FILE_CONSTANT
][i
] =
202 stride( brw_vec4_grf(reg
+subreg
/2, (subreg
%2) * 4), 0, 4, 1);
205 c
->prog_data
.nr_params
+= nr_params
* 4;
208 /* All regs allocated
210 reg
+= (subreg
+ 1) / 2;
211 c
->prog_data
.curb_read_length
= reg
- 1;
214 /* Allocate input regs:
216 c
->nr_inputs
= c
->vp
->info
.num_inputs
;
217 for (i
= 0; i
< c
->nr_inputs
; i
++) {
218 c
->regs
[TGSI_FILE_INPUT
][i
] = brw_vec8_grf(reg
, 0);
222 /* If there are no inputs, we'll still be reading one attribute's worth
223 * because it's required -- see urb_read_length setting.
225 if (c
->nr_inputs
== 0)
230 /* Allocate outputs. The non-position outputs go straight into message regs.
232 c
->nr_outputs
= c
->prog_data
.nr_outputs
;
234 if (c
->chipset
.is_igdng
)
240 if (c
->key
.fs_signature
.nr_inputs
> BRW_MAX_MRF
) {
241 c
->overflow_grf_start
= reg
;
242 c
->overflow_count
= c
->key
.fs_signature
.nr_inputs
- BRW_MAX_MRF
;
243 reg
+= c
->overflow_count
;
246 /* XXX: need to access vertex output semantics here:
248 for (i
= 0; i
< c
->nr_outputs
; i
++) {
251 /* XXX: Put output position in slot zero always. Clipper, etc,
252 * need access to this reg.
254 if (is_position_output(c
, i
)) {
255 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_vec8_grf(reg
, 0); /* copy to mrf 0 */
258 else if (find_output_slot(c
, i
, &slot
)) {
260 if (0 /* is_psize_output(c, i) */ ) {
261 /* c->psize_out.grf = reg; */
262 /* c->psize_out.mrf = i; */
265 /* The first (16-4) outputs can go straight into the message regs.
267 if (slot
+ mrf
< BRW_MAX_MRF
) {
268 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_message_reg(slot
+ mrf
);
271 int grf
= c
->overflow_grf_start
+ slot
- BRW_MAX_MRF
;
272 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_vec8_grf(grf
, 0);
276 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_null_reg();
280 /* Allocate program temporaries:
283 for (i
= 0; i
< c
->vp
->info
.file_max
[TGSI_FILE_TEMPORARY
]+1; i
++) {
284 c
->regs
[TGSI_FILE_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
288 /* Address reg(s). Don't try to use the internal address reg until
291 for (i
= 0; i
< c
->vp
->info
.file_max
[TGSI_FILE_ADDRESS
]+1; i
++) {
292 c
->regs
[TGSI_FILE_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
296 BRW_VERTICAL_STRIDE_8
,
298 BRW_HORIZONTAL_STRIDE_1
,
304 if (c
->vp
->use_const_buffer
) {
305 for (i
= 0; i
< 3; i
++) {
306 c
->current_const
[i
].index
= -1;
307 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
313 for (i
= 0; i
< 128; i
++) {
314 if (c
->output_regs
[i
].used_in_src
) {
315 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
321 if (c
->vp
->has_flow_control
) {
322 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
326 /* Some opcodes need an internal temporary:
329 c
->last_tmp
= reg
; /* for allocation purposes */
331 /* Each input reg holds data from two vertices. The
332 * urb_read_length is the number of registers read from *each*
333 * vertex urb, so is half the amount:
335 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
337 /* Setting this field to 0 leads to undefined behavior according to the
338 * the VS_STATE docs. Our VUEs will always have at least one attribute
339 * sitting in them, even if it's padding.
341 if (c
->prog_data
.urb_read_length
== 0)
342 c
->prog_data
.urb_read_length
= 1;
344 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
345 * them to fit the biggest thing they need to.
347 attributes_in_vue
= MAX2(c
->nr_outputs
, c
->nr_inputs
);
349 if (c
->chipset
.is_igdng
)
350 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 6 + 3) / 4;
352 c
->prog_data
.urb_entry_size
= (attributes_in_vue
+ 2 + 3) / 4;
354 c
->prog_data
.total_grf
= reg
;
356 if (BRW_DEBUG
& DEBUG_VS
) {
357 debug_printf("%s NumAddrRegs %d\n", __FUNCTION__
,
358 c
->vp
->info
.file_max
[TGSI_FILE_ADDRESS
]+1);
359 debug_printf("%s NumTemps %d\n", __FUNCTION__
,
360 c
->vp
->info
.file_max
[TGSI_FILE_TEMPORARY
]+1);
361 debug_printf("%s reg = %d\n", __FUNCTION__
, reg
);
367 * If an instruction uses a temp reg both as a src and the dest, we
368 * sometimes need to allocate an intermediate temporary.
370 static void unalias1( struct brw_vs_compile
*c
,
373 void (*func
)( struct brw_vs_compile
*,
377 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
378 struct brw_compile
*p
= &c
->func
;
379 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
381 brw_MOV(p
, dst
, tmp
);
391 * Checkes if 2-operand instruction needs an intermediate temporary.
393 static void unalias2( struct brw_vs_compile
*c
,
397 void (*func
)( struct brw_vs_compile
*,
402 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
403 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
404 struct brw_compile
*p
= &c
->func
;
405 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
406 func(c
, tmp
, arg0
, arg1
);
407 brw_MOV(p
, dst
, tmp
);
411 func(c
, dst
, arg0
, arg1
);
417 * Checkes if 3-operand instruction needs an intermediate temporary.
419 static void unalias3( struct brw_vs_compile
*c
,
424 void (*func
)( struct brw_vs_compile
*,
430 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
431 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
432 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
433 struct brw_compile
*p
= &c
->func
;
434 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
435 func(c
, tmp
, arg0
, arg1
, arg2
);
436 brw_MOV(p
, dst
, tmp
);
440 func(c
, dst
, arg0
, arg1
, arg2
);
444 static void emit_sop( struct brw_compile
*p
,
450 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
451 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
452 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
453 brw_set_predicate_control_flag_value(p
, 0xff);
456 static void emit_seq( struct brw_compile
*p
,
459 struct brw_reg arg1
)
461 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
464 static void emit_sne( struct brw_compile
*p
,
467 struct brw_reg arg1
)
469 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
471 static void emit_slt( struct brw_compile
*p
,
474 struct brw_reg arg1
)
476 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
479 static void emit_sle( struct brw_compile
*p
,
482 struct brw_reg arg1
)
484 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
487 static void emit_sgt( struct brw_compile
*p
,
490 struct brw_reg arg1
)
492 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
495 static void emit_sge( struct brw_compile
*p
,
498 struct brw_reg arg1
)
500 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
503 static void emit_max( struct brw_compile
*p
,
506 struct brw_reg arg1
)
508 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
509 brw_SEL(p
, dst
, arg1
, arg0
);
510 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
513 static void emit_min( struct brw_compile
*p
,
516 struct brw_reg arg1
)
518 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
519 brw_SEL(p
, dst
, arg0
, arg1
);
520 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
524 static void emit_math1( struct brw_vs_compile
*c
,
530 /* There are various odd behaviours with SEND on the simulator. In
531 * addition there are documented issues with the fact that the GEN4
532 * processor doesn't do dependency control properly on SEND
533 * results. So, on balance, this kludge to get around failures
534 * with writemasked math results looks like it might be necessary
535 * whether that turns out to be a simulator bug or not:
537 struct brw_compile
*p
= &c
->func
;
538 struct brw_reg tmp
= dst
;
539 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
540 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
548 BRW_MATH_SATURATE_NONE
,
551 BRW_MATH_DATA_SCALAR
,
555 brw_MOV(p
, dst
, tmp
);
561 static void emit_math2( struct brw_vs_compile
*c
,
568 struct brw_compile
*p
= &c
->func
;
569 struct brw_reg tmp
= dst
;
570 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
571 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
576 brw_MOV(p
, brw_message_reg(3), arg1
);
581 BRW_MATH_SATURATE_NONE
,
584 BRW_MATH_DATA_SCALAR
,
588 brw_MOV(p
, dst
, tmp
);
594 static void emit_exp_noalias( struct brw_vs_compile
*c
,
596 struct brw_reg arg0
)
598 struct brw_compile
*p
= &c
->func
;
601 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_X
) {
602 struct brw_reg tmp
= get_tmp(c
);
603 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
605 /* tmp_d = floor(arg0.x) */
606 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
608 /* result[0] = 2.0 ^ tmp */
610 /* Adjust exponent for floating point:
613 brw_ADD(p
, brw_writemask(tmp_d
, BRW_WRITEMASK_X
), tmp_d
, brw_imm_d(127));
615 /* Install exponent and sign.
616 * Excess drops off the edge:
618 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), BRW_WRITEMASK_X
),
619 tmp_d
, brw_imm_d(23));
624 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_Y
) {
625 /* result[1] = arg0.x - floor(arg0.x) */
626 brw_FRC(p
, brw_writemask(dst
, BRW_WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
629 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_Z
) {
630 /* As with the LOG instruction, we might be better off just
631 * doing a taylor expansion here, seeing as we have to do all
634 * If mathbox partial precision is too low, consider also:
635 * result[3] = result[0] * EXP(result[1])
638 BRW_MATH_FUNCTION_EXP
,
639 brw_writemask(dst
, BRW_WRITEMASK_Z
),
640 brw_swizzle1(arg0
, 0),
641 BRW_MATH_PRECISION_FULL
);
644 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_W
) {
645 /* result[3] = 1.0; */
646 brw_MOV(p
, brw_writemask(dst
, BRW_WRITEMASK_W
), brw_imm_f(1));
651 static void emit_log_noalias( struct brw_vs_compile
*c
,
653 struct brw_reg arg0
)
655 struct brw_compile
*p
= &c
->func
;
656 struct brw_reg tmp
= dst
;
657 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
658 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
659 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
660 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
664 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
667 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
670 * These almost look likey they could be joined up, but not really
673 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
674 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
676 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_XZ
) {
678 brw_writemask(tmp_ud
, BRW_WRITEMASK_X
),
679 brw_swizzle1(arg0_ud
, 0),
680 brw_imm_ud((1U<<31)-1));
683 brw_writemask(tmp_ud
, BRW_WRITEMASK_X
),
688 brw_writemask(tmp
, BRW_WRITEMASK_X
),
689 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
693 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_YZ
) {
695 brw_writemask(tmp_ud
, BRW_WRITEMASK_Y
),
696 brw_swizzle1(arg0_ud
, 0),
697 brw_imm_ud((1<<23)-1));
700 brw_writemask(tmp_ud
, BRW_WRITEMASK_Y
),
702 brw_imm_ud(127<<23));
705 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_Z
) {
706 /* result[2] = result[0] + LOG2(result[1]); */
708 /* Why bother? The above is just a hint how to do this with a
709 * taylor series. Maybe we *should* use a taylor series as by
710 * the time all the above has been done it's almost certainly
711 * quicker than calling the mathbox, even with low precision.
714 * - result[0] + mathbox.LOG2(result[1])
715 * - mathbox.LOG2(arg0.x)
716 * - result[0] + inline_taylor_approx(result[1])
719 BRW_MATH_FUNCTION_LOG
,
720 brw_writemask(tmp
, BRW_WRITEMASK_Z
),
721 brw_swizzle1(tmp
, 1),
722 BRW_MATH_PRECISION_FULL
);
725 brw_writemask(tmp
, BRW_WRITEMASK_Z
),
726 brw_swizzle1(tmp
, 2),
727 brw_swizzle1(tmp
, 0));
730 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_W
) {
731 /* result[3] = 1.0; */
732 brw_MOV(p
, brw_writemask(tmp
, BRW_WRITEMASK_W
), brw_imm_f(1));
736 brw_MOV(p
, dst
, tmp
);
742 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
744 static void emit_dst_noalias( struct brw_vs_compile
*c
,
749 struct brw_compile
*p
= &c
->func
;
751 /* There must be a better way to do this:
753 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_X
)
754 brw_MOV(p
, brw_writemask(dst
, BRW_WRITEMASK_X
), brw_imm_f(1.0));
755 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_Y
)
756 brw_MUL(p
, brw_writemask(dst
, BRW_WRITEMASK_Y
), arg0
, arg1
);
757 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_Z
)
758 brw_MOV(p
, brw_writemask(dst
, BRW_WRITEMASK_Z
), arg0
);
759 if (dst
.dw1
.bits
.writemask
& BRW_WRITEMASK_W
)
760 brw_MOV(p
, brw_writemask(dst
, BRW_WRITEMASK_W
), arg1
);
764 static void emit_xpd( struct brw_compile
*p
,
769 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
770 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
774 static void emit_lit_noalias( struct brw_vs_compile
*c
,
776 struct brw_reg arg0
)
778 struct brw_compile
*p
= &c
->func
;
779 struct brw_instruction
*if_insn
;
780 struct brw_reg tmp
= dst
;
781 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
786 brw_MOV(p
, brw_writemask(dst
, BRW_WRITEMASK_YZ
), brw_imm_f(0));
787 brw_MOV(p
, brw_writemask(dst
, BRW_WRITEMASK_XW
), brw_imm_f(1));
789 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
790 * to get all channels active inside the IF. In the clipping code
791 * we run with NoMask, so it's not an option and we can use
792 * BRW_EXECUTE_1 for all comparisions.
794 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
795 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
797 brw_MOV(p
, brw_writemask(dst
, BRW_WRITEMASK_Y
), brw_swizzle1(arg0
,0));
799 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
800 brw_MOV(p
, brw_writemask(tmp
, BRW_WRITEMASK_Z
), brw_swizzle1(arg0
,1));
801 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
804 BRW_MATH_FUNCTION_POW
,
805 brw_writemask(dst
, BRW_WRITEMASK_Z
),
806 brw_swizzle1(tmp
, 2),
807 brw_swizzle1(arg0
, 3),
808 BRW_MATH_PRECISION_PARTIAL
);
811 brw_ENDIF(p
, if_insn
);
816 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
822 struct brw_compile
*p
= &c
->func
;
824 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
825 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
826 brw_MAC(p
, dst
, arg0
, arg1
);
829 /** 3 or 4-component vector normalization */
830 static void emit_nrm( struct brw_vs_compile
*c
,
835 struct brw_compile
*p
= &c
->func
;
836 struct brw_reg tmp
= get_tmp(c
);
838 /* tmp = dot(arg0, arg0) */
840 brw_DP3(p
, tmp
, arg0
, arg0
);
842 brw_DP4(p
, tmp
, arg0
, arg0
);
844 /* tmp = 1 / sqrt(tmp) */
845 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
847 /* dst = arg0 * tmp */
848 brw_MUL(p
, dst
, arg0
, tmp
);
854 static struct brw_reg
855 get_constant(struct brw_vs_compile
*c
,
860 struct brw_compile
*p
= &c
->func
;
861 struct brw_reg const_reg
;
862 struct brw_reg const2_reg
;
864 assert(argIndex
< 3);
866 if (c
->current_const
[argIndex
].index
!= index
|| relAddr
) {
867 struct brw_reg addrReg
= c
->regs
[TGSI_FILE_ADDRESS
][0];
869 c
->current_const
[argIndex
].index
= index
;
872 printf(" fetch const[%d] for arg %d into reg %d\n",
873 src
.Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
875 /* need to fetch the constant now */
877 c
->current_const
[argIndex
].reg
,/* writeback dest */
879 relAddr
, /* relative indexing? */
880 addrReg
, /* address register */
881 16 * index
, /* byte offset */
882 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
887 const2_reg
= get_tmp(c
);
889 /* use upper half of address reg for second read */
890 addrReg
= stride(addrReg
, 0, 4, 0);
894 const2_reg
, /* writeback dest */
896 relAddr
, /* relative indexing? */
897 addrReg
, /* address register */
898 16 * index
, /* byte offset */
899 SURF_INDEX_VERT_CONST_BUFFER
904 const_reg
= c
->current_const
[argIndex
].reg
;
907 /* merge the two Owords into the constant register */
908 /* const_reg[7..4] = const2_reg[7..4] */
910 suboffset(stride(const_reg
, 0, 4, 1), 4),
911 suboffset(stride(const2_reg
, 0, 4, 1), 4));
912 release_tmp(c
, const2_reg
);
915 /* replicate lower four floats into upper half (to get XYZWXYZW) */
916 const_reg
= stride(const_reg
, 0, 4, 0);
925 /* TODO: relative addressing!
927 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
928 enum tgsi_file_type file
,
932 case TGSI_FILE_TEMPORARY
:
933 case TGSI_FILE_INPUT
:
934 case TGSI_FILE_OUTPUT
:
935 case TGSI_FILE_CONSTANT
:
936 assert(c
->regs
[file
][index
].nr
!= 0);
937 return c
->regs
[file
][index
];
939 case TGSI_FILE_ADDRESS
:
941 return c
->regs
[file
][index
];
943 case TGSI_FILE_NULL
: /* undef values */
944 return brw_null_reg();
948 return brw_null_reg();
954 * Indirect addressing: get reg[[arg] + offset].
956 static struct brw_reg
deref( struct brw_vs_compile
*c
,
960 struct brw_compile
*p
= &c
->func
;
961 struct brw_reg tmp
= vec4(get_tmp(c
));
962 struct brw_reg addr_reg
= c
->regs
[TGSI_FILE_ADDRESS
][0];
963 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_UW
);
964 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
965 struct brw_reg indirect
= brw_vec4_indirect(0,0);
968 brw_push_insn_state(p
);
969 brw_set_access_mode(p
, BRW_ALIGN_1
);
971 /* This is pretty clunky - load the address register twice and
972 * fetch each 4-dword value in turn. There must be a way to do
973 * this in a single pass, but I couldn't get it to work.
975 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
976 brw_MOV(p
, tmp
, indirect
);
978 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
979 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
981 brw_pop_insn_state(p
);
984 /* NOTE: tmp not released */
990 * Get brw reg corresponding to the instruction's [argIndex] src reg.
991 * TODO: relative addressing!
993 static struct brw_reg
994 get_src_reg( struct brw_vs_compile
*c
,
1002 case TGSI_FILE_TEMPORARY
:
1003 case TGSI_FILE_INPUT
:
1004 case TGSI_FILE_OUTPUT
:
1006 return deref(c
, c
->regs
[file
][0], index
);
1009 assert(c
->regs
[file
][index
].nr
!= 0);
1010 return c
->regs
[file
][index
];
1013 case TGSI_FILE_IMMEDIATE
:
1014 return c
->regs
[file
][index
];
1016 case TGSI_FILE_CONSTANT
:
1017 if (c
->vp
->use_const_buffer
) {
1018 return get_constant(c
, argIndex
, index
, relAddr
);
1021 return deref(c
, c
->regs
[TGSI_FILE_CONSTANT
][0], index
);
1024 assert(c
->regs
[TGSI_FILE_CONSTANT
][index
].nr
!= 0);
1025 return c
->regs
[TGSI_FILE_CONSTANT
][index
];
1027 case TGSI_FILE_ADDRESS
:
1029 return c
->regs
[file
][index
];
1031 case TGSI_FILE_NULL
:
1032 /* this is a normal case since we loop over all three src args */
1033 return brw_null_reg();
1037 return brw_null_reg();
1042 static void emit_arl( struct brw_vs_compile
*c
,
1044 struct brw_reg arg0
)
1046 struct brw_compile
*p
= &c
->func
;
1047 struct brw_reg tmp
= dst
;
1048 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1053 brw_RNDD(p
, tmp
, arg0
); /* tmp = round(arg0) */
1054 brw_MUL(p
, dst
, tmp
, brw_imm_d(16)); /* dst = tmp * 16 */
1057 release_tmp(c
, tmp
);
1062 * Return the brw reg for the given instruction's src argument.
1064 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
1065 const struct tgsi_full_src_register
*src
,
1070 if (src
->Register
.File
== TGSI_FILE_NULL
)
1071 return brw_null_reg();
1073 reg
= get_src_reg(c
, argIndex
,
1075 src
->Register
.Index
,
1076 src
->Register
.Indirect
);
1078 /* Convert 3-bit swizzle to 2-bit.
1080 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(src
->Register
.SwizzleX
,
1081 src
->Register
.SwizzleY
,
1082 src
->Register
.SwizzleZ
,
1083 src
->Register
.SwizzleW
);
1085 reg
.negate
= src
->Register
.Negate
? 1 : 0;
1095 * Get brw register for the given program dest register.
1097 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
1100 unsigned writemask
)
1105 case TGSI_FILE_TEMPORARY
:
1106 case TGSI_FILE_OUTPUT
:
1107 assert(c
->regs
[file
][index
].nr
!= 0);
1108 reg
= c
->regs
[file
][index
];
1110 case TGSI_FILE_ADDRESS
:
1112 reg
= c
->regs
[file
][index
];
1114 case TGSI_FILE_NULL
:
1115 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1116 reg
= brw_null_reg();
1120 reg
= brw_null_reg();
1123 reg
.dw1
.bits
.writemask
= writemask
;
1132 * Post-vertex-program processing. Send the results to the URB.
1134 static void emit_vertex_write( struct brw_vs_compile
*c
)
1136 struct brw_compile
*p
= &c
->func
;
1137 struct brw_reg m0
= brw_message_reg(0);
1138 struct brw_reg pos
= c
->regs
[TGSI_FILE_OUTPUT
][VERT_RESULT_HPOS
];
1142 GLuint len_vertext_header
= 2;
1144 if (c
->key
.copy_edgeflag
) {
1146 get_reg(c
, TGSI_FILE_OUTPUT
, c
->prog_data
.output_edgeflag
),
1150 /* Build ndc coords */
1152 /* ndc = 1.0 / pos.w */
1153 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1154 /* ndc.xyz = pos * ndc */
1155 brw_MUL(p
, brw_writemask(ndc
, BRW_WRITEMASK_XYZ
), pos
, ndc
);
1157 /* Update the header for point size, user clipping flags, and -ve rhw
1160 if (c
->prog_data
.writes_psiz
||
1161 c
->key
.nr_userclip
||
1164 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1167 brw_MOV(p
, header1
, brw_imm_ud(0));
1169 brw_set_access_mode(p
, BRW_ALIGN_16
);
1171 if (c
->prog_data
.writes_psiz
) {
1172 struct brw_reg psiz
= c
->regs
[TGSI_FILE_OUTPUT
][VERT_RESULT_PSIZ
];
1173 brw_MUL(p
, brw_writemask(header1
, BRW_WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1174 brw_AND(p
, brw_writemask(header1
, BRW_WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1177 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1178 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1179 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1180 brw_OR(p
, brw_writemask(header1
, BRW_WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1181 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1184 /* i965 clipping workaround:
1185 * 1) Test for -ve rhw
1187 * set ndc = (0,0,0,0)
1190 * Later, clipping will detect ucp[6] and ensure the primitive is
1191 * clipped against all fixed planes.
1193 if (c
->chipset
.is_965
) {
1195 vec8(brw_null_reg()),
1197 brw_swizzle1(ndc
, 3),
1200 brw_OR(p
, brw_writemask(header1
, BRW_WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1201 brw_MOV(p
, ndc
, brw_imm_f(0));
1202 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1205 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1206 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1207 brw_set_access_mode(p
, BRW_ALIGN_16
);
1209 release_tmp(c
, header1
);
1212 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1215 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1216 * of zeros followed by two sets of NDC coordinates:
1218 brw_set_access_mode(p
, BRW_ALIGN_1
);
1219 brw_MOV(p
, offset(m0
, 2), ndc
);
1221 if (c
->chipset
.is_igdng
) {
1222 /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
1223 brw_MOV(p
, offset(m0
, 3), pos
); /* a portion of vertex header */
1224 /* m4, m5 contain the distances from vertex to the user clip planeXXX.
1225 * Seems it is useless for us.
1226 * m6 is used for aligning, so that the remainder of vertex element is
1229 brw_MOV(p
, offset(m0
, 7), pos
); /* the remainder of vertex element */
1230 len_vertext_header
= 6;
1232 brw_MOV(p
, offset(m0
, 3), pos
);
1233 len_vertext_header
= 2;
1236 eot
= (c
->overflow_count
== 0);
1239 brw_null_reg(), /* dest */
1240 0, /* starting mrf reg nr */
1244 MIN2(c
->nr_outputs
+ 1 + len_vertext_header
, (BRW_MAX_MRF
-1)), /* msg len */
1245 0, /* response len */
1247 eot
, /* writes complete */
1248 0, /* urb destination offset */
1249 BRW_URB_SWIZZLE_INTERLEAVE
);
1251 /* Not all of the vertex outputs/results fit into the MRF.
1252 * Move the overflowed attributes from the GRF to the MRF and
1253 * issue another brw_urb_WRITE().
1255 for (i
= 0; i
< c
->overflow_count
; i
+= BRW_MAX_MRF
) {
1256 unsigned nr
= MIN2(c
->overflow_count
- i
, BRW_MAX_MRF
);
1259 eot
= (i
+ nr
>= c
->overflow_count
);
1261 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1264 for (j
= 0; j
< nr
; j
++) {
1265 brw_MOV(p
, brw_message_reg(4+j
),
1266 brw_vec8_grf(c
->overflow_grf_start
+ i
+ j
, 0));
1270 brw_null_reg(), /* dest */
1271 4, /* starting mrf reg nr */
1276 0, /* response len */
1278 eot
, /* writes complete */
1279 i
-1, /* urb destination offset */
1280 BRW_URB_SWIZZLE_INTERLEAVE
);
1286 * Called after code generation to resolve subroutine calls and the
1288 * \param end_inst points to brw code for END instruction
1289 * \param last_inst points to last instruction emitted before vertex write
1292 post_vs_emit( struct brw_vs_compile
*c
,
1293 struct brw_instruction
*end_inst
,
1294 struct brw_instruction
*last_inst
)
1298 brw_resolve_cals(&c
->func
);
1300 /* patch up the END code to jump past subroutines, etc */
1301 offset
= last_inst
- end_inst
;
1303 brw_set_src1(end_inst
, brw_imm_d(offset
* 16));
1305 end_inst
->header
.opcode
= BRW_OPCODE_NOP
;
1310 get_predicate(const struct tgsi_full_instruction
*inst
)
1312 /* XXX: disabling for now
1315 if (inst
->dst
.CondMask
== COND_TR
)
1316 return BRW_PREDICATE_NONE
;
1318 /* All of GLSL only produces predicates for COND_NE and one channel per
1319 * vector. Fail badly if someone starts doing something else, as it might
1320 * mean infinite looping or something.
1322 * We'd like to support all the condition codes, but our hardware doesn't
1323 * quite match the Mesa IR, which is modeled after the NV extensions. For
1324 * those, the instruction may update the condition codes or not, then any
1325 * later instruction may use one of those condition codes. For gen4, the
1326 * instruction may update the flags register based on one of the condition
1327 * codes output by the instruction, and then further instructions may
1328 * predicate on that. We can probably support this, but it won't
1329 * necessarily be easy.
1331 /* assert(inst->dst.CondMask == COND_NE); */
1333 switch (inst
->dst
.CondSwizzle
) {
1335 return BRW_PREDICATE_ALIGN16_REPLICATE_X
;
1337 return BRW_PREDICATE_ALIGN16_REPLICATE_Y
;
1339 return BRW_PREDICATE_ALIGN16_REPLICATE_Z
;
1341 return BRW_PREDICATE_ALIGN16_REPLICATE_W
;
1343 debug_printf("Unexpected predicate: 0x%08x\n",
1344 inst
->dst
.CondMask
);
1345 return BRW_PREDICATE_NORMAL
;
1348 return BRW_PREDICATE_NORMAL
;
/* emit_insn: translate one TGSI instruction into native instructions,
 * appending them to the instruction store of c->func.
 *
 * c    - vertex shader compile state (instruction store, IF and loop
 *        stacks, call stack index, end_offset).
 * inst - the parsed TGSI instruction to translate.
 *
 * Visible flow:
 *  1. Three source operands are resolved with get_arg() into args[].
 *  2. The destination is derived from Dst[0]'s register File, Index and
 *     WriteMask.
 *  3. A saturate modifier other than TGSI_SAT_NONE only produces a debug
 *     message; no clamping code is emitted here.
 *  4. The opcode selects what is emitted. Arithmetic opcodes go to a brw_
 *     emitter or an emit_ helper, some through unalias1/2/3 together with a
 *     _noalias emitter (presumably to cope with dst overlapping a source,
 *     as the existing "Get dest regs" remark warns -- TODO confirm).
 *     IF/ELSE/ENDIF and BGNLOOP/ENDLOOP keep their open instructions in
 *     c->if_inst[] and c->loop_inst[]. CAL and RET save and restore the
 *     instruction pointer through c->stack_index. END records its position
 *     in c->end_offset.
 *  5. The final debug_printf reports an unsupported opcode by number and
 *     name.
 *  6. The trailing block sets BRW_CONDITIONAL_NZ on the last emitted
 *     instruction when inst->CondUpdate is set.
 *
 * NOTE(review): this listing has lost lines (the declaration of i, the call
 * that assigns dst, the switch header, the per-case break statements, the
 * default label, the bodies of BRK and CONT between their two predicate
 * calls, the br and loop-depth bookkeeping in ENDLOOP, ...). Do not read the
 * cases as falling through into each other; verify against the pristine
 * source before changing behavior.
 *
 * NOTE(review): the "Unsupported saturate" and "Unsupported opcode" messages
 * have no trailing newline, unlike the other debug_printf calls in this
 * file -- confirm whether that is intended.
 */
1352 static void emit_insn(struct brw_vs_compile
*c
,
1353 const struct tgsi_full_instruction
*inst
)
1355 unsigned opcode
= inst
->Instruction
.Opcode
;
1356 unsigned label
= inst
->Label
.Label
;
1357 struct brw_compile
*p
= &c
->func
;
1358 struct brw_reg args
[3], dst
;
1362 printf("%d: ", insn
);
1363 _mesa_print_instruction(inst
);
1366 /* Get argument regs.
1368 for (i
= 0; i
< 3; i
++) {
1369 args
[i
] = get_arg(c
, &inst
->Src
[i
], i
);
1372 /* Get dest regs. Note that it is possible for a reg to be both
1373 * dst and arg, given the static allocation of registers. So
1374 * care needs to be taken emitting multi-operation instructions.
1377 inst
->Dst
[0].Register
.File
,
1378 inst
->Dst
[0].Register
.Index
,
1379 inst
->Dst
[0].Register
.WriteMask
);
1383 if (inst
->Instruction
.Saturate
!= TGSI_SAT_NONE
) {
1384 debug_printf("Unsupported saturate in vertex shader");
1388 case TGSI_OPCODE_ABS
:
1389 brw_MOV(p
, dst
, brw_abs(args
[0]));
1391 case TGSI_OPCODE_ADD
:
1392 brw_ADD(p
, dst
, args
[0], args
[1]);
1394 case TGSI_OPCODE_COS
:
1395 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1397 case TGSI_OPCODE_DP3
:
1398 brw_DP3(p
, dst
, args
[0], args
[1]);
1400 case TGSI_OPCODE_DP4
:
1401 brw_DP4(p
, dst
, args
[0], args
[1]);
1403 case TGSI_OPCODE_DPH
:
1404 brw_DPH(p
, dst
, args
[0], args
[1]);
1406 case TGSI_OPCODE_NRM
:
1407 emit_nrm(c
, dst
, args
[0], 3);
1409 case TGSI_OPCODE_NRM4
:
1410 emit_nrm(c
, dst
, args
[0], 4);
1412 case TGSI_OPCODE_DST
:
1413 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1415 case TGSI_OPCODE_EXP
:
1416 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1418 case TGSI_OPCODE_EX2
:
1419 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1421 case TGSI_OPCODE_ARL
:
1422 emit_arl(c
, dst
, args
[0]);
1424 case TGSI_OPCODE_FLR
:
1425 brw_RNDD(p
, dst
, args
[0]);
1427 case TGSI_OPCODE_FRC
:
1428 brw_FRC(p
, dst
, args
[0]);
1430 case TGSI_OPCODE_LOG
:
1431 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1433 case TGSI_OPCODE_LG2
:
1434 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1436 case TGSI_OPCODE_LIT
:
1437 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1439 case TGSI_OPCODE_LRP
:
1440 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1442 case TGSI_OPCODE_MAD
:
1443 brw_MOV(p
, brw_acc_reg(), args
[2]);
1444 brw_MAC(p
, dst
, args
[0], args
[1]);
1446 case TGSI_OPCODE_MAX
:
1447 emit_max(p
, dst
, args
[0], args
[1]);
1449 case TGSI_OPCODE_MIN
:
1450 emit_min(p
, dst
, args
[0], args
[1]);
1452 case TGSI_OPCODE_MOV
:
1453 brw_MOV(p
, dst
, args
[0]);
1455 case TGSI_OPCODE_MUL
:
1456 brw_MUL(p
, dst
, args
[0], args
[1]);
1458 case TGSI_OPCODE_POW
:
1459 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1461 case TGSI_OPCODE_RCP
:
1462 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1464 case TGSI_OPCODE_RSQ
:
1465 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
,
1466 brw_swizzle(args
[0], 0,0,0,0), BRW_MATH_PRECISION_FULL
);
1468 case TGSI_OPCODE_SEQ
:
1469 emit_seq(p
, dst
, args
[0], args
[1]);
1471 case TGSI_OPCODE_SIN
:
1472 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1474 case TGSI_OPCODE_SNE
:
1475 emit_sne(p
, dst
, args
[0], args
[1]);
1477 case TGSI_OPCODE_SGE
:
1478 emit_sge(p
, dst
, args
[0], args
[1]);
1480 case TGSI_OPCODE_SGT
:
1481 emit_sgt(p
, dst
, args
[0], args
[1]);
1483 case TGSI_OPCODE_SLT
:
1484 emit_slt(p
, dst
, args
[0], args
[1]);
1486 case TGSI_OPCODE_SLE
:
1487 emit_sle(p
, dst
, args
[0], args
[1]);
1489 case TGSI_OPCODE_SUB
:
1490 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1492 case TGSI_OPCODE_TRUNC
:
1493 /* round toward zero */
1494 brw_RNDZ(p
, dst
, args
[0]);
1496 case TGSI_OPCODE_XPD
:
1497 emit_xpd(p
, dst
, args
[0], args
[1]);
/* IF: the emitted IF instruction is remembered at the current nesting depth
 * so that the matching ELSE and ENDIF cases can hand it to brw_ELSE and
 * brw_ENDIF. The predicate is written after the call because brw_IF
 * overwrites it.
 */
1499 case TGSI_OPCODE_IF
:
1500 assert(c
->if_depth
< MAX_IF_DEPTH
);
1501 c
->if_inst
[c
->if_depth
] = brw_IF(p
, BRW_EXECUTE_8
);
1502 /* Note that brw_IF smashes the predicate_control field. */
1503 c
->if_inst
[c
->if_depth
]->header
.predicate_control
= get_predicate(inst
);
/* NOTE(review): unlike ENDIF below, ELSE indexes
 * c->if_inst[c->if_depth - 1] without asserting c->if_depth > 0.
 */
1506 case TGSI_OPCODE_ELSE
:
1507 c
->if_inst
[c
->if_depth
-1] = brw_ELSE(p
, c
->if_inst
[c
->if_depth
-1]);
1509 case TGSI_OPCODE_ENDIF
:
1510 assert(c
->if_depth
> 0);
1511 brw_ENDIF(p
, c
->if_inst
[--c
->if_depth
]);
/* NOTE(review): no bound check on c->loop_depth is visible here, whereas
 * the IF case asserts if_depth < MAX_IF_DEPTH before storing.
 */
1513 case TGSI_OPCODE_BGNLOOP
:
1514 c
->loop_inst
[c
->loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1516 case TGSI_OPCODE_BRK
:
1517 brw_set_predicate_control(p
, get_predicate(inst
));
1519 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1521 case TGSI_OPCODE_CONT
:
1522 brw_set_predicate_control(p
, get_predicate(inst
));
1524 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* ENDLOOP: emit the WHILE that closes the loop, then walk inst0 back over
 * the loop body towards the recorded loop start and fill in the jump_count
 * and pop_count of the break and continue instructions found there. A break
 * gets a distance one instruction longer than a continue. The multiplier br
 * and the loop-depth update live on lines missing from this listing.
 */
1526 case TGSI_OPCODE_ENDLOOP
:
1528 struct brw_instruction
*inst0
, *inst1
;
1533 if (c
->chipset
.is_igdng
)
1536 inst0
= inst1
= brw_WHILE(p
, c
->loop_inst
[c
->loop_depth
]);
1537 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1538 while (inst0
> c
->loop_inst
[c
->loop_depth
]) {
/* NOTE(review): header.opcode is a native opcode field (post_vs_emit
 * assigns BRW_OPCODE_NOP to it), while TGSI_OPCODE_BRK and TGSI_OPCODE_CONT
 * are TGSI opcodes, the namespace of this function's case labels. Comparing
 * the two looks like a mix-up that would keep the patching below from ever
 * matching; presumably the native break and continue opcodes were intended
 * -- confirm against the BRW opcode definitions.
 */
1540 if (inst0
->header
.opcode
== TGSI_OPCODE_BRK
) {
1541 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
+ 1);
1542 inst0
->bits3
.if_else
.pop_count
= 0;
1544 else if (inst0
->header
.opcode
== TGSI_OPCODE_CONT
) {
1545 inst0
->bits3
.if_else
.jump_count
= br
* (inst1
- inst0
);
1546 inst0
->bits3
.if_else
.pop_count
= 0;
1551 case TGSI_OPCODE_BRA
:
1552 brw_set_predicate_control(p
, get_predicate(inst
));
1553 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1554 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
/* CAL: store IP + 3*16 in the slot addressed by c->stack_index (the value
 * the RET case later moves back into IP), advance that address register by
 * 4, record the call with its label through brw_save_call(), then emit an
 * IP-relative ADD. Presumably the immediate of that ADD is rewritten once
 * the label positions are known (post_vs_emit calls brw_resolve_cals) --
 * TODO confirm.
 */
1556 case TGSI_OPCODE_CAL
:
1557 brw_set_access_mode(p
, BRW_ALIGN_1
);
1558 brw_ADD(p
, deref_1d(c
->stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1559 brw_set_access_mode(p
, BRW_ALIGN_16
);
1560 brw_ADD(p
, get_addr_reg(c
->stack_index
),
1561 get_addr_reg(c
->stack_index
), brw_imm_d(4));
1562 brw_save_call(p
, label
, p
->nr_insn
);
1563 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
/* RET: step the stack address register back by 4 and load IP from the slot
 * it now addresses, undoing what CAL stored.
 */
1565 case TGSI_OPCODE_RET
:
1566 brw_ADD(p
, get_addr_reg(c
->stack_index
),
1567 get_addr_reg(c
->stack_index
), brw_imm_d(-4));
1568 brw_set_access_mode(p
, BRW_ALIGN_1
);
1569 brw_MOV(p
, brw_ip_reg(), deref_1d(c
->stack_index
, 0));
1570 brw_set_access_mode(p
, BRW_ALIGN_16
);
/* END: remember the index of the instruction about to be emitted in
 * c->end_offset; brw_vs_emit turns it into the end_inst pointer that it
 * passes to post_vs_emit.
 */
1572 case TGSI_OPCODE_END
:
1573 c
->end_offset
= p
->nr_insn
;
1574 /* this instruction will get patched later to jump past subroutine
1577 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1579 case TGSI_OPCODE_BGNSUB
:
1580 brw_save_label(p
, p
->nr_insn
, p
->nr_insn
);
1582 case TGSI_OPCODE_ENDSUB
:
1586 debug_printf("Unsupported opcode %i (%s) in vertex shader",
1588 tgsi_get_opcode_name(opcode
));
1591 /* Set the predication update on the last instruction of the native
1592 * instruction sequence.
1594 * This would be problematic if it was set on a math instruction,
1595 * but that shouldn't be the case with the current GLSL compiler.
1600 if (inst
->CondUpdate
) {
1601 struct brw_instruction
*hw_insn
= &p
->store
[p
->nr_insn
- 1];
1603 assert(hw_insn
->header
.destreg__conditionalmod
== 0);
1604 hw_insn
->header
.destreg__conditionalmod
= BRW_CONDITIONAL_NZ
;
/* brw_vs_emit: entry point that turns the TGSI token stream of c->vp into
 * native instructions in c->func.
 *
 * c - vertex shader compile state; reads c->vp (tokens, has_flow_control)
 *     and c->stack, writes c->stack_index and the instruction store of
 *     c->func.
 *
 * Visible sequence:
 *  1. When BRW_DEBUG has DEBUG_VS set, the TGSI tokens are dumped with
 *     tgsi_dump().
 *  2. c->stack_index is set to brw_indirect(0, 0); compression control is
 *     set to BRW_COMPRESSION_NONE and the access mode to BRW_ALIGN_16.
 *  3. brw_vs_alloc_regs() performs the static register allocation.
 *  4. If the program has flow control, the address of c->stack is moved
 *     into the address register selected by c->stack_index.
 *  5. The tokens are parsed one at a time; instruction tokens are passed to
 *     emit_insn(). What happens for declaration and immediate tokens is on
 *     lines missing from this listing.
 *  6. end_inst (the instruction at c->end_offset) and last_inst (the slot
 *     at the current instruction count) are captured before
 *     emit_vertex_write() appends its code, so last_inst addresses the
 *     first instruction of that code.
 *  7. post_vs_emit() patches the END instruction using those two pointers.
 *  8. When DEBUG_VS is set, the native code is disassembled to stderr after
 *     a "vs-native:" heading.
 *
 * NOTE(review): in the code visible here c->end_offset is only written by
 * the TGSI_OPCODE_END case of emit_insn; for a token stream without END,
 * end_inst would be derived from whatever value end_offset already holds --
 * confirm that END is guaranteed or that end_offset is initialised
 * elsewhere.
 *
 * NOTE(review): this listing has lost lines (braces, break statements, the
 * default case of the token switch); verify against the pristine source
 * before changing behavior.
 */
1612 /* Emit the vertex program instructions here.
1614 void brw_vs_emit(struct brw_vs_compile
*c
)
1616 struct brw_compile
*p
= &c
->func
;
1617 const struct tgsi_token
*tokens
= c
->vp
->tokens
;
1618 struct brw_instruction
*end_inst
, *last_inst
;
1619 struct tgsi_parse_context parse
;
1620 struct tgsi_full_instruction
*inst
;
1622 if (BRW_DEBUG
& DEBUG_VS
)
1623 tgsi_dump(c
->vp
->tokens
, 0);
1625 c
->stack_index
= brw_indirect(0, 0);
1627 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1628 brw_set_access_mode(p
, BRW_ALIGN_16
);
1631 /* Static register allocation
1633 brw_vs_alloc_regs(c
);
1635 if (c
->vp
->has_flow_control
) {
1636 brw_MOV(p
, get_addr_reg(c
->stack_index
), brw_address(c
->stack
));
1641 tgsi_parse_init( &parse
, tokens
);
1642 while( !tgsi_parse_end_of_tokens( &parse
) ) {
1643 tgsi_parse_token( &parse
);
1645 switch( parse
.FullToken
.Token
.Type
) {
1646 case TGSI_TOKEN_TYPE_DECLARATION
:
1647 case TGSI_TOKEN_TYPE_IMMEDIATE
:
1650 case TGSI_TOKEN_TYPE_INSTRUCTION
:
1651 inst
= &parse
.FullToken
.FullInstruction
;
1652 emit_insn( c
, inst
);
1659 tgsi_parse_free( &parse
);
1661 end_inst
= &p
->store
[c
->end_offset
];
1662 last_inst
= &p
->store
[p
->nr_insn
];
1664 /* The END instruction will be patched to jump to this code */
1665 emit_vertex_write(c
);
1667 post_vs_emit(c
, end_inst
, last_inst
);
1669 if (BRW_DEBUG
& DEBUG_VS
) {
1670 debug_printf("vs-native:\n");
1671 brw_disasm(stderr
, p
->store
, p
->nr_insn
);