2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
32 #include "brw_context.h"
35 #include "pipe/p_shader_tokens.h"
36 #include "tgsi/tgsi_parse.h"
38 struct brw_prog_info
{
43 unsigned writes_psize
;
46 unsigned result_edge_idx
;
47 unsigned edge_flag_idx
;
51 /* Do things as simply as possible. Allocate and populate all regs
54 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
,
55 struct brw_prog_info
*info
)
57 unsigned i
, reg
= 0, mrf
;
60 /* r0 -- reserved as usual
62 c
->r0
= brw_vec8_grf(reg
, 0); reg
++;
64 /* User clip planes from curbe:
66 if (c
->key
.nr_userclip
) {
67 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
68 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
71 /* Deal with curbe alignment:
73 reg
+= ((6+c
->key
.nr_userclip
+3)/4)*2;
76 /* Vertex program parameters from curbe:
78 nr_params
= c
->prog_data
.max_const
;
79 for (i
= 0; i
< nr_params
; i
++) {
80 c
->regs
[TGSI_FILE_CONSTANT
][i
] = stride(brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
82 reg
+= (nr_params
+1)/2;
83 c
->prog_data
.curb_read_length
= reg
- 1;
87 /* Allocate input regs:
89 c
->nr_inputs
= c
->vp
->info
.num_inputs
;
90 for (i
= 0; i
< c
->nr_inputs
; i
++) {
91 c
->regs
[TGSI_FILE_INPUT
][i
] = brw_vec8_grf(reg
, 0);
96 /* Allocate outputs: TODO: could organize the non-position outputs
97 * to go straight into message regs.
100 c
->first_output
= reg
;
102 for (i
= 0; i
< c
->vp
->info
.num_outputs
; i
++) {
105 if (i
== VERT_RESULT_HPOS
) {
106 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
109 else if (i
== VERT_RESULT_PSIZ
) {
110 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
112 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
115 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_message_reg(mrf
);
119 /*treat pos differently for now */
120 if (i
== info
->pos_idx
) {
121 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
124 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_message_reg(mrf
);
130 /* Allocate program temporaries:
132 for (i
= 0; i
< info
->num_temps
; i
++) {
133 c
->regs
[TGSI_FILE_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
137 /* Address reg(s). Don't try to use the internal address reg until
140 for (i
= 0; i
< info
->num_addrs
; i
++) {
141 c
->regs
[TGSI_FILE_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
145 BRW_VERTICAL_STRIDE_8
,
147 BRW_HORIZONTAL_STRIDE_1
,
153 for (i
= 0; i
< 128; i
++) {
154 if (c
->output_regs
[i
].used_in_src
) {
155 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
160 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
164 /* Some opcodes need an internal temporary:
167 c
->last_tmp
= reg
; /* for allocation purposes */
169 /* Each input reg holds data from two vertices. The
170 * urb_read_length is the number of registers read from *each*
171 * vertex urb, so is half the amount:
173 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+1)/2;
175 c
->prog_data
.urb_entry_size
= (c
->nr_outputs
+2+3)/4;
176 c
->prog_data
.total_grf
= reg
;
180 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
182 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
184 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
185 c
->prog_data
.total_grf
= c
->last_tmp
;
190 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
192 if (tmp
.nr
== c
->last_tmp
-1)
196 static void release_tmps( struct brw_vs_compile
*c
)
198 c
->last_tmp
= c
->first_tmp
;
202 static void unalias1( struct brw_vs_compile
*c
,
205 void (*func
)( struct brw_vs_compile
*,
209 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
210 struct brw_compile
*p
= &c
->func
;
211 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
213 brw_MOV(p
, dst
, tmp
);
220 static void unalias2( struct brw_vs_compile
*c
,
224 void (*func
)( struct brw_vs_compile
*,
229 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
230 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
231 struct brw_compile
*p
= &c
->func
;
232 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
233 func(c
, tmp
, arg0
, arg1
);
234 brw_MOV(p
, dst
, tmp
);
237 func(c
, dst
, arg0
, arg1
);
241 static void emit_sop( struct brw_compile
*p
,
247 brw_push_insn_state(p
);
248 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
249 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
250 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
251 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
252 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
253 brw_pop_insn_state(p
);
256 static void emit_seq( struct brw_compile
*p
,
259 struct brw_reg arg1
)
261 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
264 static void emit_sne( struct brw_compile
*p
,
267 struct brw_reg arg1
)
269 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
271 static void emit_slt( struct brw_compile
*p
,
274 struct brw_reg arg1
)
276 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
279 static void emit_sle( struct brw_compile
*p
,
282 struct brw_reg arg1
)
284 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
287 static void emit_sgt( struct brw_compile
*p
,
290 struct brw_reg arg1
)
292 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
295 static void emit_sge( struct brw_compile
*p
,
298 struct brw_reg arg1
)
300 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
303 static void emit_max( struct brw_compile
*p
,
306 struct brw_reg arg1
)
308 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
309 brw_SEL(p
, dst
, arg1
, arg0
);
310 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
313 static void emit_min( struct brw_compile
*p
,
316 struct brw_reg arg1
)
318 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
319 brw_SEL(p
, dst
, arg0
, arg1
);
320 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
324 static void emit_math1( struct brw_vs_compile
*c
,
330 /* There are various odd behaviours with SEND on the simulator. In
331 * addition there are documented issues with the fact that the GEN4
332 * processor doesn't do dependency control properly on SEND
333 * results. So, on balance, this kludge to get around failures
334 * with writemasked math results looks like it might be necessary
335 * whether that turns out to be a simulator bug or not:
337 struct brw_compile
*p
= &c
->func
;
338 struct brw_reg tmp
= dst
;
339 boolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
340 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
348 BRW_MATH_SATURATE_NONE
,
351 BRW_MATH_DATA_SCALAR
,
355 brw_MOV(p
, dst
, tmp
);
360 static void emit_math2( struct brw_vs_compile
*c
,
367 struct brw_compile
*p
= &c
->func
;
368 struct brw_reg tmp
= dst
;
369 boolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
370 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
375 brw_MOV(p
, brw_message_reg(3), arg1
);
380 BRW_MATH_SATURATE_NONE
,
383 BRW_MATH_DATA_SCALAR
,
387 brw_MOV(p
, dst
, tmp
);
394 static void emit_exp_noalias( struct brw_vs_compile
*c
,
396 struct brw_reg arg0
)
398 struct brw_compile
*p
= &c
->func
;
401 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_X
) {
402 struct brw_reg tmp
= get_tmp(c
);
403 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
405 /* tmp_d = floor(arg0.x) */
406 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
408 /* result[0] = 2.0 ^ tmp */
410 /* Adjust exponent for floating point:
413 brw_ADD(p
, brw_writemask(tmp_d
, TGSI_WRITEMASK_X
), tmp_d
, brw_imm_d(127));
415 /* Install exponent and sign.
416 * Excess drops off the edge:
418 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), TGSI_WRITEMASK_X
),
419 tmp_d
, brw_imm_d(23));
424 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Y
) {
425 /* result[1] = arg0.x - floor(arg0.x) */
426 brw_FRC(p
, brw_writemask(dst
, TGSI_WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
429 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Z
) {
430 /* As with the LOG instruction, we might be better off just
431 * doing a taylor expansion here, seeing as we have to do all
434 * If mathbox partial precision is too low, consider also:
435 * result[3] = result[0] * EXP(result[1])
438 BRW_MATH_FUNCTION_EXP
,
439 brw_writemask(dst
, TGSI_WRITEMASK_Z
),
440 brw_swizzle1(arg0
, 0),
441 BRW_MATH_PRECISION_PARTIAL
);
444 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_W
) {
445 /* result[3] = 1.0; */
446 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_W
), brw_imm_f(1));
451 static void emit_log_noalias( struct brw_vs_compile
*c
,
453 struct brw_reg arg0
)
455 struct brw_compile
*p
= &c
->func
;
456 struct brw_reg tmp
= dst
;
457 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
458 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
459 boolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
460 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
464 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
467 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
470 * These almost look likey they could be joined up, but not really
473 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
474 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
476 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_XZ
) {
478 brw_writemask(tmp_ud
, TGSI_WRITEMASK_X
),
479 brw_swizzle1(arg0_ud
, 0),
480 brw_imm_ud((1U<<31)-1));
483 brw_writemask(tmp_ud
, TGSI_WRITEMASK_X
),
488 brw_writemask(tmp
, TGSI_WRITEMASK_X
),
489 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
493 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_YZ
) {
495 brw_writemask(tmp_ud
, TGSI_WRITEMASK_Y
),
496 brw_swizzle1(arg0_ud
, 0),
497 brw_imm_ud((1<<23)-1));
500 brw_writemask(tmp_ud
, TGSI_WRITEMASK_Y
),
502 brw_imm_ud(127<<23));
505 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Z
) {
506 /* result[2] = result[0] + LOG2(result[1]); */
508 /* Why bother? The above is just a hint how to do this with a
509 * taylor series. Maybe we *should* use a taylor series as by
510 * the time all the above has been done it's almost certainly
511 * quicker than calling the mathbox, even with low precision.
514 * - result[0] + mathbox.LOG2(result[1])
515 * - mathbox.LOG2(arg0.x)
516 * - result[0] + inline_taylor_approx(result[1])
519 BRW_MATH_FUNCTION_LOG
,
520 brw_writemask(tmp
, TGSI_WRITEMASK_Z
),
521 brw_swizzle1(tmp
, 1),
522 BRW_MATH_PRECISION_FULL
);
525 brw_writemask(tmp
, TGSI_WRITEMASK_Z
),
526 brw_swizzle1(tmp
, 2),
527 brw_swizzle1(tmp
, 0));
530 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_W
) {
531 /* result[3] = 1.0; */
532 brw_MOV(p
, brw_writemask(tmp
, TGSI_WRITEMASK_W
), brw_imm_f(1));
536 brw_MOV(p
, dst
, tmp
);
544 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
546 static void emit_dst_noalias( struct brw_vs_compile
*c
,
551 struct brw_compile
*p
= &c
->func
;
553 /* There must be a better way to do this:
555 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_X
)
556 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_X
), brw_imm_f(1.0));
557 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Y
)
558 brw_MUL(p
, brw_writemask(dst
, TGSI_WRITEMASK_Y
), arg0
, arg1
);
559 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Z
)
560 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_Z
), arg0
);
561 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_W
)
562 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_W
), arg1
);
565 static void emit_xpd( struct brw_compile
*p
,
570 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
571 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
576 static void emit_lit_noalias( struct brw_vs_compile
*c
,
578 struct brw_reg arg0
)
580 struct brw_compile
*p
= &c
->func
;
581 struct brw_instruction
*if_insn
;
582 struct brw_reg tmp
= dst
;
583 boolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
588 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_YZ
), brw_imm_f(0));
589 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_XW
), brw_imm_f(1));
591 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
592 * to get all channels active inside the IF. In the clipping code
593 * we run with NoMask, so it's not an option and we can use
594 * BRW_EXECUTE_1 for all comparisions.
596 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
597 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
599 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_Y
), brw_swizzle1(arg0
,0));
601 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
602 brw_MOV(p
, brw_writemask(tmp
, TGSI_WRITEMASK_Z
), brw_swizzle1(arg0
,1));
603 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
606 BRW_MATH_FUNCTION_POW
,
607 brw_writemask(dst
, TGSI_WRITEMASK_Z
),
608 brw_swizzle1(tmp
, 2),
609 brw_swizzle1(arg0
, 3),
610 BRW_MATH_PRECISION_PARTIAL
);
613 brw_ENDIF(p
, if_insn
);
620 /* TODO: relative addressing!
622 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
627 case TGSI_FILE_TEMPORARY
:
628 case TGSI_FILE_INPUT
:
629 case TGSI_FILE_OUTPUT
:
630 assert(c
->regs
[file
][index
].nr
!= 0);
631 return c
->regs
[file
][index
];
632 case TGSI_FILE_CONSTANT
:
633 assert(c
->regs
[TGSI_FILE_CONSTANT
][index
+ c
->prog_data
.num_imm
].nr
!= 0);
634 return c
->regs
[TGSI_FILE_CONSTANT
][index
+ c
->prog_data
.num_imm
];
635 case TGSI_FILE_IMMEDIATE
:
636 assert(c
->regs
[TGSI_FILE_CONSTANT
][index
].nr
!= 0);
637 return c
->regs
[TGSI_FILE_CONSTANT
][index
];
638 case TGSI_FILE_ADDRESS
:
640 return c
->regs
[file
][index
];
642 case TGSI_FILE_NULL
: /* undef values */
643 return brw_null_reg();
647 return brw_null_reg();
653 static struct brw_reg
deref( struct brw_vs_compile
*c
,
657 struct brw_compile
*p
= &c
->func
;
658 struct brw_reg tmp
= vec4(get_tmp(c
));
659 struct brw_reg vp_address
= retype(vec1(get_reg(c
, TGSI_FILE_ADDRESS
, 0)), BRW_REGISTER_TYPE_UW
);
660 unsigned byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
661 struct brw_reg indirect
= brw_vec4_indirect(0,0);
664 brw_push_insn_state(p
);
665 brw_set_access_mode(p
, BRW_ALIGN_1
);
667 /* This is pretty clunky - load the address register twice and
668 * fetch each 4-dword value in turn. There must be a way to do
669 * this in a single pass, but I couldn't get it to work.
671 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
672 brw_MOV(p
, tmp
, indirect
);
674 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
675 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
677 brw_pop_insn_state(p
);
684 static void emit_arl( struct brw_vs_compile
*c
,
686 struct brw_reg arg0
)
688 struct brw_compile
*p
= &c
->func
;
689 struct brw_reg tmp
= dst
;
690 boolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
695 brw_RNDD(p
, tmp
, arg0
);
696 brw_MUL(p
, dst
, tmp
, brw_imm_d(16));
703 /* Will return mangled results for SWZ op. The emit_swz() function
704 * ignores this result and recalculates taking extended swizzles into
707 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
708 struct tgsi_src_register
*src
)
712 if (src
->File
== TGSI_FILE_NULL
)
713 return brw_null_reg();
717 reg
= deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], src
->Index
);
720 reg
= get_reg(c
, src
->File
, src
->Index
);
722 /* Convert 3-bit swizzle to 2-bit.
724 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(src
->SwizzleX
,
729 /* Note this is ok for non-swizzle instructions:
731 reg
.negate
= src
->Negate
? 1 : 0;
737 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
738 const struct tgsi_dst_register
*dst
)
740 struct brw_reg reg
= get_reg(c
, dst
->File
, dst
->Index
);
742 reg
.dw1
.bits
.writemask
= dst
->WriteMask
;
750 static void emit_swz( struct brw_vs_compile
*c
,
752 struct tgsi_src_register src
)
754 struct brw_compile
*p
= &c
->func
;
755 unsigned zeros_mask
= 0;
756 unsigned ones_mask
= 0;
757 unsigned src_mask
= 0;
759 boolean need_tmp
= (src
.Negate
&&
760 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
761 struct brw_reg tmp
= dst
;
767 for (i
= 0; i
< 4; i
++) {
768 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
792 case TGSI_EXTSWIZZLE_ZERO
:
795 case TGSI_EXTSWIZZLE_ONE
:
802 /* Do src first, in case dst aliases src:
809 arg0
= deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], src
.Index
);
812 arg0
= get_reg(c
, src
.File
, src
.Index
);
814 arg0
= brw_swizzle(arg0
,
815 src_swz
[0], src_swz
[1],
816 src_swz
[2], src_swz
[3]);
818 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
822 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
825 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
828 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
831 brw_MOV(p
, dst
, tmp
);
838 /* Post-vertex-program processing. Send the results to the URB.
840 static void emit_vertex_write( struct brw_vs_compile
*c
, struct brw_prog_info
*info
)
842 struct brw_compile
*p
= &c
->func
;
843 struct brw_reg m0
= brw_message_reg(0);
844 struct brw_reg pos
= c
->regs
[TGSI_FILE_OUTPUT
][info
->pos_idx
];
847 if (c
->key
.copy_edgeflag
) {
849 get_reg(c
, TGSI_FILE_OUTPUT
, info
->result_edge_idx
),
850 get_reg(c
, TGSI_FILE_INPUT
, info
->edge_flag_idx
));
854 /* Build ndc coords? TODO: Shortcircuit when w is known to be one.
856 if (!c
->key
.know_w_is_one
) {
858 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
859 brw_MUL(p
, brw_writemask(ndc
, TGSI_WRITEMASK_XYZ
), pos
, ndc
);
865 /* This includes the workaround for -ve rhw, so is no longer an
868 if (info
->writes_psize
||
869 c
->key
.nr_userclip
||
870 !c
->key
.know_w_is_one
)
872 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
875 brw_MOV(p
, header1
, brw_imm_ud(0));
877 brw_set_access_mode(p
, BRW_ALIGN_16
);
879 if (info
->writes_psize
) {
880 struct brw_reg psiz
= c
->regs
[TGSI_FILE_OUTPUT
][info
->psize_idx
];
881 brw_MUL(p
, brw_writemask(header1
, TGSI_WRITEMASK_W
),
882 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
883 brw_AND(p
, brw_writemask(header1
, TGSI_WRITEMASK_W
), header1
,
884 brw_imm_ud(0x7ff<<8));
888 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
889 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
890 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
891 brw_OR(p
, brw_writemask(header1
, TGSI_WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
892 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
896 /* i965 clipping workaround:
897 * 1) Test for -ve rhw
899 * set ndc = (0,0,0,0)
902 * Later, clipping will detect ucp[6] and ensure the primitive is
903 * clipped against all fixed planes.
905 if (!c
->key
.know_w_is_one
) {
907 vec8(brw_null_reg()),
909 brw_swizzle1(ndc
, 3),
912 brw_OR(p
, brw_writemask(header1
, TGSI_WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
913 brw_MOV(p
, ndc
, brw_imm_f(0));
914 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
917 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
918 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
919 brw_set_access_mode(p
, BRW_ALIGN_16
);
921 release_tmp(c
, header1
);
924 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
928 /* Emit the (interleaved) headers for the two vertices - an 8-reg
929 * of zeros followed by two sets of NDC coordinates:
931 brw_set_access_mode(p
, BRW_ALIGN_1
);
932 brw_MOV(p
, offset(m0
, 2), ndc
);
933 brw_MOV(p
, offset(m0
, 3), pos
);
937 brw_null_reg(), /* dest */
938 0, /* starting mrf reg nr */
942 c
->nr_outputs
+ 3, /* msg len */
943 0, /* response len */
945 1, /* writes complete */
946 0, /* urb destination offset */
947 BRW_URB_SWIZZLE_INTERLEAVE
);
952 post_vs_emit( struct brw_vs_compile
*c
, struct brw_instruction
*end_inst
)
954 struct tgsi_parse_context parse
;
955 const struct tgsi_token
*tokens
= c
->vp
->program
.tokens
;
956 tgsi_parse_init(&parse
, tokens
);
957 while (!tgsi_parse_end_of_tokens(&parse
)) {
958 tgsi_parse_token(&parse
);
959 if (parse
.FullToken
.Token
.Type
== TGSI_TOKEN_TYPE_INSTRUCTION
) {
961 struct brw_instruction
*brw_inst1
, *brw_inst2
;
962 const struct tgsi_full_instruction
*inst1
, *inst2
;
964 inst1
= &parse
.FullToken
.FullInstruction
;
965 brw_inst1
= inst1
->Data
;
966 switch (inst1
->Opcode
) {
967 case TGSI_OPCODE_CAL
:
968 case TGSI_OPCODE_BRA
:
969 target_insn
= inst1
->BranchTarget
;
970 inst2
= &c
->vp
->program
.Base
.Instructions
[target_insn
];
971 brw_inst2
= inst2
->Data
;
972 offset
= brw_inst2
- brw_inst1
;
973 brw_set_src1(brw_inst1
, brw_imm_d(offset
*16));
975 case TGSI_OPCODE_END
:
976 offset
= end_inst
- brw_inst1
;
977 brw_set_src1(brw_inst1
, brw_imm_d(offset
*16));
985 tgsi_parse_free(&parse
);
988 static void process_declaration(const struct tgsi_full_declaration
*decl
,
989 struct brw_prog_info
*info
)
991 int first
= decl
->DeclarationRange
.First
;
992 int last
= decl
->DeclarationRange
.Last
;
994 switch(decl
->Declaration
.File
) {
995 case TGSI_FILE_CONSTANT
:
996 info
->num_consts
+= last
- first
+ 1;
998 case TGSI_FILE_INPUT
: {
1001 case TGSI_FILE_OUTPUT
: {
1002 assert(last
== first
); /* for now */
1003 if (decl
->Declaration
.Semantic
) {
1004 switch (decl
->Semantic
.SemanticName
) {
1005 case TGSI_SEMANTIC_POSITION
: {
1006 info
->pos_idx
= first
;
1009 case TGSI_SEMANTIC_COLOR
:
1011 case TGSI_SEMANTIC_BCOLOR
:
1013 case TGSI_SEMANTIC_FOG
:
1015 case TGSI_SEMANTIC_PSIZE
: {
1016 info
->writes_psize
= TRUE
;
1017 info
->psize_idx
= first
;
1020 case TGSI_SEMANTIC_GENERIC
:
1026 case TGSI_FILE_TEMPORARY
: {
1027 info
->num_temps
+= (last
- first
) + 1;
1030 case TGSI_FILE_SAMPLER
: {
1033 case TGSI_FILE_ADDRESS
: {
1034 info
->num_addrs
+= (last
- first
) + 1;
1037 case TGSI_FILE_IMMEDIATE
: {
1040 case TGSI_FILE_NULL
: {
1046 static void process_instruction(struct brw_vs_compile
*c
,
1047 struct tgsi_full_instruction
*inst
,
1048 struct brw_prog_info
*info
)
1050 struct brw_reg args
[3], dst
;
1051 struct brw_compile
*p
= &c
->func
;
1052 /*struct brw_indirect stack_index = brw_indirect(0, 0);*/
1056 /*FIXME: might not be the only one*/
1057 const struct tgsi_dst_register
*dst_reg
= &inst
->FullDstRegisters
[0].DstRegister
;
1059 struct brw_instruction *if_inst[MAX_IFSN];
1060 unsigned insn, if_insn = 0;
1063 for (i
= 0; i
< 3; i
++) {
1064 struct tgsi_full_src_register
*src
= &inst
->FullSrcRegisters
[i
];
1065 index
= src
->SrcRegister
.Index
;
1066 file
= src
->SrcRegister
.File
;
1067 if (file
== TGSI_FILE_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1068 args
[i
] = c
->output_regs
[index
].reg
;
1070 args
[i
] = get_arg(c
, &src
->SrcRegister
);
1073 /* Get dest regs. Note that it is possible for a reg to be both
1074 * dst and arg, given the static allocation of registers. So
1075 * care needs to be taken emitting multi-operation instructions.
1077 index
= dst_reg
->Index
;
1078 file
= dst_reg
->File
;
1079 if (file
== TGSI_FILE_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1080 dst
= c
->output_regs
[index
].reg
;
1082 dst
= get_dst(c
, dst_reg
);
1084 switch (inst
->Instruction
.Opcode
) {
1085 case TGSI_OPCODE_ABS
:
1086 brw_MOV(p
, dst
, brw_abs(args
[0]));
1088 case TGSI_OPCODE_ADD
:
1089 brw_ADD(p
, dst
, args
[0], args
[1]);
1091 case TGSI_OPCODE_DP3
:
1092 brw_DP3(p
, dst
, args
[0], args
[1]);
1094 case TGSI_OPCODE_DP4
:
1095 brw_DP4(p
, dst
, args
[0], args
[1]);
1097 case TGSI_OPCODE_DPH
:
1098 brw_DPH(p
, dst
, args
[0], args
[1]);
1100 case TGSI_OPCODE_DST
:
1101 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1103 case TGSI_OPCODE_EXP
:
1104 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1106 case TGSI_OPCODE_EX2
:
1107 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1109 case TGSI_OPCODE_ARL
:
1110 emit_arl(c
, dst
, args
[0]);
1112 case TGSI_OPCODE_FLR
:
1113 brw_RNDD(p
, dst
, args
[0]);
1115 case TGSI_OPCODE_FRC
:
1116 brw_FRC(p
, dst
, args
[0]);
1118 case TGSI_OPCODE_LOG
:
1119 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1121 case TGSI_OPCODE_LG2
:
1122 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1124 case TGSI_OPCODE_LIT
:
1125 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1127 case TGSI_OPCODE_MAD
:
1128 brw_MOV(p
, brw_acc_reg(), args
[2]);
1129 brw_MAC(p
, dst
, args
[0], args
[1]);
1131 case TGSI_OPCODE_MAX
:
1132 emit_max(p
, dst
, args
[0], args
[1]);
1134 case TGSI_OPCODE_MIN
:
1135 emit_min(p
, dst
, args
[0], args
[1]);
1137 case TGSI_OPCODE_MOV
:
1138 case TGSI_OPCODE_SWZ
:
1140 /* The args[0] value can't be used here as it won't have
1141 * correctly encoded the full swizzle:
1143 emit_swz(c
, dst
, inst
->SrcReg
[0] );
1145 brw_MOV(p
, dst
, args
[0]);
1147 case TGSI_OPCODE_MUL
:
1148 brw_MUL(p
, dst
, args
[0], args
[1]);
1150 case TGSI_OPCODE_POW
:
1151 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1153 case TGSI_OPCODE_RCP
:
1154 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1156 case TGSI_OPCODE_RSQ
:
1157 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1160 case TGSI_OPCODE_SEQ
:
1161 emit_seq(p
, dst
, args
[0], args
[1]);
1163 case TGSI_OPCODE_SNE
:
1164 emit_sne(p
, dst
, args
[0], args
[1]);
1166 case TGSI_OPCODE_SGE
:
1167 emit_sge(p
, dst
, args
[0], args
[1]);
1169 case TGSI_OPCODE_SGT
:
1170 emit_sgt(p
, dst
, args
[0], args
[1]);
1172 case TGSI_OPCODE_SLT
:
1173 emit_slt(p
, dst
, args
[0], args
[1]);
1175 case TGSI_OPCODE_SLE
:
1176 emit_sle(p
, dst
, args
[0], args
[1]);
1178 case TGSI_OPCODE_SUB
:
1179 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1181 case TGSI_OPCODE_XPD
:
1182 emit_xpd(p
, dst
, args
[0], args
[1]);
1185 case TGSI_OPCODE_IF
:
1186 assert(if_insn
< MAX_IFSN
);
1187 if_inst
[if_insn
++] = brw_IF(p
, BRW_EXECUTE_8
);
1189 case TGSI_OPCODE_ELSE
:
1190 if_inst
[if_insn
-1] = brw_ELSE(p
, if_inst
[if_insn
-1]);
1192 case TGSI_OPCODE_ENDIF
:
1193 assert(if_insn
> 0);
1194 brw_ENDIF(p
, if_inst
[--if_insn
]);
1196 case TGSI_OPCODE_BRA
:
1197 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
1198 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1199 brw_set_predicate_control_flag_value(p
, 0xff);
1201 case TGSI_OPCODE_CAL
:
1202 brw_set_access_mode(p
, BRW_ALIGN_1
);
1203 brw_ADD(p
, deref_1uw(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1204 brw_set_access_mode(p
, BRW_ALIGN_16
);
1205 brw_ADD(p
, get_addr_reg(stack_index
),
1206 get_addr_reg(stack_index
), brw_imm_d(4));
1207 inst
->Data
= &p
->store
[p
->nr_insn
];
1208 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1211 case TGSI_OPCODE_RET
:
1213 brw_ADD(p
, get_addr_reg(stack_index
),
1214 get_addr_reg(stack_index
), brw_imm_d(-4));
1215 brw_set_access_mode(p
, BRW_ALIGN_1
);
1216 brw_MOV(p
, brw_ip_reg(), deref_1uw(stack_index
, 0));
1217 brw_set_access_mode(p
, BRW_ALIGN_16
);
1219 /*brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));*/
1222 case TGSI_OPCODE_END
:
1223 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1225 case TGSI_OPCODE_BGNSUB
:
1226 case TGSI_OPCODE_ENDSUB
:
1229 debug_printf("Unsupport opcode %d in vertex shader\n", inst
->Instruction
.Opcode
);
1233 if (dst_reg
->File
== TGSI_FILE_OUTPUT
1234 && dst_reg
->Index
!= info
->pos_idx
1235 && c
->output_regs
[dst_reg
->Index
].used_in_src
)
1236 brw_MOV(p
, get_dst(c
, dst_reg
), dst
);
1241 /* Emit the fragment program instructions here.
1243 void brw_vs_emit(struct brw_vs_compile
*c
)
1246 struct brw_compile
*p
= &c
->func
;
1247 struct brw_instruction
*end_inst
;
1248 struct tgsi_parse_context parse
;
1249 struct brw_indirect stack_index
= brw_indirect(0, 0);
1250 const struct tgsi_token
*tokens
= c
->vp
->program
.tokens
;
1251 struct brw_prog_info prog_info
;
1252 unsigned allocated_registers
= 0;
1253 memset(&prog_info
, 0, sizeof(struct brw_prog_info
));
1255 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1256 brw_set_access_mode(p
, BRW_ALIGN_16
);
1258 tgsi_parse_init(&parse
, tokens
);
1259 /* Message registers can't be read, so copy the output into GRF register
1260 if they are used in source registers */
1261 while (!tgsi_parse_end_of_tokens(&parse
)) {
1262 tgsi_parse_token(&parse
);
1264 switch (parse
.FullToken
.Token
.Type
) {
1265 case TGSI_TOKEN_TYPE_INSTRUCTION
: {
1266 const struct tgsi_full_instruction
*inst
= &parse
.FullToken
.FullInstruction
;
1267 for (i
= 0; i
< 3; ++i
) {
1268 const struct tgsi_src_register
*src
= &inst
->FullSrcRegisters
[i
].SrcRegister
;
1269 unsigned index
= src
->Index
;
1270 unsigned file
= src
->File
;
1271 if (file
== TGSI_FILE_OUTPUT
)
1272 c
->output_regs
[index
].used_in_src
= TRUE
;
1281 tgsi_parse_free(&parse
);
1283 tgsi_parse_init(&parse
, tokens
);
1285 while (!tgsi_parse_end_of_tokens(&parse
)) {
1286 tgsi_parse_token(&parse
);
1288 switch (parse
.FullToken
.Token
.Type
) {
1289 case TGSI_TOKEN_TYPE_DECLARATION
: {
1290 struct tgsi_full_declaration
*decl
= &parse
.FullToken
.FullDeclaration
;
1291 process_declaration(decl
, &prog_info
);
1294 case TGSI_TOKEN_TYPE_IMMEDIATE
: {
1295 struct tgsi_full_immediate
*imm
= &parse
.FullToken
.FullImmediate
;
1296 /*assert(imm->Immediate.Size == 4);*/
1297 c
->prog_data
.imm_buf
[c
->prog_data
.num_imm
][0] = imm
->u
.ImmediateFloat32
[0].Float
;
1298 c
->prog_data
.imm_buf
[c
->prog_data
.num_imm
][1] = imm
->u
.ImmediateFloat32
[1].Float
;
1299 c
->prog_data
.imm_buf
[c
->prog_data
.num_imm
][2] = imm
->u
.ImmediateFloat32
[2].Float
;
1300 c
->prog_data
.imm_buf
[c
->prog_data
.num_imm
][3] = imm
->u
.ImmediateFloat32
[3].Float
;
1301 c
->prog_data
.num_imm
++;
1304 case TGSI_TOKEN_TYPE_INSTRUCTION
: {
1305 struct tgsi_full_instruction
*inst
= &parse
.FullToken
.FullInstruction
;
1306 if (!allocated_registers
) {
1307 /* first instruction (declerations finished).
1308 * now that we know what vars are being used allocate
1309 * registers for them.*/
1310 c
->prog_data
.num_consts
= prog_info
.num_consts
;
1311 c
->prog_data
.max_const
= prog_info
.num_consts
+ c
->prog_data
.num_imm
;
1312 brw_vs_alloc_regs(c
, &prog_info
);
1314 brw_set_access_mode(p
, BRW_ALIGN_1
);
1315 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1316 brw_set_access_mode(p
, BRW_ALIGN_16
);
1317 allocated_registers
= 1;
1319 process_instruction(c
, inst
, &prog_info
);
1325 end_inst
= &p
->store
[p
->nr_insn
];
1326 emit_vertex_write(c
, &prog_info
);
1327 post_vs_emit(c
, end_inst
);
1328 tgsi_parse_free(&parse
);