2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
32 #include "brw_context.h"
35 #include "pipe/p_shader_tokens.h"
36 #include "tgsi/util/tgsi_parse.h"
38 struct brw_prog_info
{
43 unsigned writes_psize
;
46 unsigned result_edge_idx
;
47 unsigned edge_flag_idx
;
51 /* Do things as simply as possible. Allocate and populate all regs
54 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
,
55 struct brw_prog_info
*info
)
57 unsigned i
, reg
= 0, mrf
;
60 /* r0 -- reserved as usual
62 c
->r0
= brw_vec8_grf(reg
, 0); reg
++;
64 /* User clip planes from curbe:
66 if (c
->key
.nr_userclip
) {
67 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
68 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
71 /* Deal with curbe alignment:
73 reg
+= ((6+c
->key
.nr_userclip
+3)/4)*2;
76 /* Vertex program parameters from curbe:
78 nr_params
= c
->prog_data
.max_const
;
79 for (i
= 0; i
< nr_params
; i
++) {
80 c
->regs
[TGSI_FILE_CONSTANT
][i
] = stride(brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
82 reg
+= (nr_params
+1)/2;
83 c
->prog_data
.curb_read_length
= reg
- 1;
87 /* Allocate input regs:
89 c
->nr_inputs
= c
->vp
->info
.num_inputs
;
90 for (i
= 0; i
< c
->nr_inputs
; i
++) {
91 c
->regs
[TGSI_FILE_INPUT
][i
] = brw_vec8_grf(reg
, 0);
96 /* Allocate outputs: TODO: could organize the non-position outputs
97 * to go straight into message regs.
100 c
->first_output
= reg
;
102 for (i
= 0; i
< c
->vp
->info
.num_outputs
; i
++) {
105 if (i
== VERT_RESULT_HPOS
) {
106 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
109 else if (i
== VERT_RESULT_PSIZ
) {
110 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
112 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
115 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_message_reg(mrf
);
119 /*treat pos differently for now */
120 if (i
== info
->pos_idx
) {
121 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
124 c
->regs
[TGSI_FILE_OUTPUT
][i
] = brw_message_reg(mrf
);
130 /* Allocate program temporaries:
132 for (i
= 0; i
< info
->num_temps
; i
++) {
133 c
->regs
[TGSI_FILE_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
137 /* Address reg(s). Don't try to use the internal address reg until
140 for (i
= 0; i
< info
->num_addrs
; i
++) {
141 c
->regs
[TGSI_FILE_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
145 BRW_VERTICAL_STRIDE_8
,
147 BRW_HORIZONTAL_STRIDE_1
,
153 for (i
= 0; i
< 128; i
++) {
154 if (c
->output_regs
[i
].used_in_src
) {
155 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
160 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
164 /* Some opcodes need an internal temporary:
167 c
->last_tmp
= reg
; /* for allocation purposes */
169 /* Each input reg holds data from two vertices. The
170 * urb_read_length is the number of registers read from *each*
171 * vertex urb, so is half the amount:
173 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+1)/2;
175 c
->prog_data
.urb_entry_size
= (c
->nr_outputs
+2+3)/4;
176 c
->prog_data
.total_grf
= reg
;
180 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
182 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
184 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
185 c
->prog_data
.total_grf
= c
->last_tmp
;
190 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
192 if (tmp
.nr
== c
->last_tmp
-1)
196 static void release_tmps( struct brw_vs_compile
*c
)
198 c
->last_tmp
= c
->first_tmp
;
202 static void unalias1( struct brw_vs_compile
*c
,
205 void (*func
)( struct brw_vs_compile
*,
209 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
210 struct brw_compile
*p
= &c
->func
;
211 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
213 brw_MOV(p
, dst
, tmp
);
220 static void unalias2( struct brw_vs_compile
*c
,
224 void (*func
)( struct brw_vs_compile
*,
229 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
230 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
231 struct brw_compile
*p
= &c
->func
;
232 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
233 func(c
, tmp
, arg0
, arg1
);
234 brw_MOV(p
, dst
, tmp
);
237 func(c
, dst
, arg0
, arg1
);
241 static void emit_sop( struct brw_compile
*p
,
247 brw_push_insn_state(p
);
248 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
249 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
250 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
251 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
252 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
253 brw_pop_insn_state(p
);
256 static void emit_seq( struct brw_compile
*p
,
259 struct brw_reg arg1
)
261 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
264 static void emit_sne( struct brw_compile
*p
,
267 struct brw_reg arg1
)
269 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
271 static void emit_slt( struct brw_compile
*p
,
274 struct brw_reg arg1
)
276 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
279 static void emit_sle( struct brw_compile
*p
,
282 struct brw_reg arg1
)
284 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
287 static void emit_sgt( struct brw_compile
*p
,
290 struct brw_reg arg1
)
292 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
295 static void emit_sge( struct brw_compile
*p
,
298 struct brw_reg arg1
)
300 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
303 static void emit_max( struct brw_compile
*p
,
306 struct brw_reg arg1
)
308 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
309 brw_SEL(p
, dst
, arg1
, arg0
);
310 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
313 static void emit_min( struct brw_compile
*p
,
316 struct brw_reg arg1
)
318 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
319 brw_SEL(p
, dst
, arg0
, arg1
);
320 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
324 static void emit_math1( struct brw_vs_compile
*c
,
330 /* There are various odd behaviours with SEND on the simulator. In
331 * addition there are documented issues with the fact that the GEN4
332 * processor doesn't do dependency control properly on SEND
333 * results. So, on balance, this kludge to get around failures
334 * with writemasked math results looks like it might be necessary
335 * whether that turns out to be a simulator bug or not:
337 struct brw_compile
*p
= &c
->func
;
338 struct brw_reg tmp
= dst
;
339 boolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
340 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
348 BRW_MATH_SATURATE_NONE
,
351 BRW_MATH_DATA_SCALAR
,
355 brw_MOV(p
, dst
, tmp
);
360 static void emit_math2( struct brw_vs_compile
*c
,
367 struct brw_compile
*p
= &c
->func
;
368 struct brw_reg tmp
= dst
;
369 boolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
370 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
375 brw_MOV(p
, brw_message_reg(3), arg1
);
380 BRW_MATH_SATURATE_NONE
,
383 BRW_MATH_DATA_SCALAR
,
387 brw_MOV(p
, dst
, tmp
);
394 static void emit_exp_noalias( struct brw_vs_compile
*c
,
396 struct brw_reg arg0
)
398 struct brw_compile
*p
= &c
->func
;
401 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_X
) {
402 struct brw_reg tmp
= get_tmp(c
);
403 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
405 /* tmp_d = floor(arg0.x) */
406 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
408 /* result[0] = 2.0 ^ tmp */
410 /* Adjust exponent for floating point:
413 brw_ADD(p
, brw_writemask(tmp_d
, TGSI_WRITEMASK_X
), tmp_d
, brw_imm_d(127));
415 /* Install exponent and sign.
416 * Excess drops off the edge:
418 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), TGSI_WRITEMASK_X
),
419 tmp_d
, brw_imm_d(23));
424 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Y
) {
425 /* result[1] = arg0.x - floor(arg0.x) */
426 brw_FRC(p
, brw_writemask(dst
, TGSI_WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
429 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Z
) {
430 /* As with the LOG instruction, we might be better off just
431 * doing a taylor expansion here, seeing as we have to do all
434 * If mathbox partial precision is too low, consider also:
435 * result[2] = result[0] * EXP(result[1])
438 BRW_MATH_FUNCTION_EXP
,
439 brw_writemask(dst
, TGSI_WRITEMASK_Z
),
440 brw_swizzle1(arg0
, 0),
441 BRW_MATH_PRECISION_PARTIAL
);
444 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_W
) {
445 /* result[3] = 1.0; */
446 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_W
), brw_imm_f(1));
451 static void emit_log_noalias( struct brw_vs_compile
*c
,
453 struct brw_reg arg0
)
455 struct brw_compile
*p
= &c
->func
;
456 struct brw_reg tmp
= dst
;
457 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
458 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
459 boolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
460 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
464 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
467 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
470 * These almost look like they could be joined up, but not really
473 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
474 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
476 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_XZ
) {
478 brw_writemask(tmp_ud
, TGSI_WRITEMASK_X
),
479 brw_swizzle1(arg0_ud
, 0),
480 brw_imm_ud((1U<<31)-1));
483 brw_writemask(tmp_ud
, TGSI_WRITEMASK_X
),
488 brw_writemask(tmp
, TGSI_WRITEMASK_X
),
489 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
493 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_YZ
) {
495 brw_writemask(tmp_ud
, TGSI_WRITEMASK_Y
),
496 brw_swizzle1(arg0_ud
, 0),
497 brw_imm_ud((1<<23)-1));
500 brw_writemask(tmp_ud
, TGSI_WRITEMASK_Y
),
502 brw_imm_ud(127<<23));
505 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Z
) {
506 /* result[2] = result[0] + LOG2(result[1]); */
508 /* Why bother? The above is just a hint how to do this with a
509 * taylor series. Maybe we *should* use a taylor series as by
510 * the time all the above has been done it's almost certainly
511 * quicker than calling the mathbox, even with low precision.
514 * - result[0] + mathbox.LOG2(result[1])
515 * - mathbox.LOG2(arg0.x)
516 * - result[0] + inline_taylor_approx(result[1])
519 BRW_MATH_FUNCTION_LOG
,
520 brw_writemask(tmp
, TGSI_WRITEMASK_Z
),
521 brw_swizzle1(tmp
, 1),
522 BRW_MATH_PRECISION_FULL
);
525 brw_writemask(tmp
, TGSI_WRITEMASK_Z
),
526 brw_swizzle1(tmp
, 2),
527 brw_swizzle1(tmp
, 0));
530 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_W
) {
531 /* result[3] = 1.0; */
532 brw_MOV(p
, brw_writemask(tmp
, TGSI_WRITEMASK_W
), brw_imm_f(1));
536 brw_MOV(p
, dst
, tmp
);
544 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
546 static void emit_dst_noalias( struct brw_vs_compile
*c
,
551 struct brw_compile
*p
= &c
->func
;
553 /* There must be a better way to do this:
555 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_X
)
556 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_X
), brw_imm_f(1.0));
557 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Y
)
558 brw_MUL(p
, brw_writemask(dst
, TGSI_WRITEMASK_Y
), arg0
, arg1
);
559 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_Z
)
560 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_Z
), arg0
);
561 if (dst
.dw1
.bits
.writemask
& TGSI_WRITEMASK_W
)
562 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_W
), arg1
);
565 static void emit_xpd( struct brw_compile
*p
,
570 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
571 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
576 static void emit_lit_noalias( struct brw_vs_compile
*c
,
578 struct brw_reg arg0
)
580 struct brw_compile
*p
= &c
->func
;
581 struct brw_instruction
*if_insn
;
582 struct brw_reg tmp
= dst
;
583 boolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
588 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_YZ
), brw_imm_f(0));
589 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_XW
), brw_imm_f(1));
591 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
592 * to get all channels active inside the IF. In the clipping code
593 * we run with NoMask, so it's not an option and we can use
594 * BRW_EXECUTE_1 for all comparisons.
596 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
597 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
599 brw_MOV(p
, brw_writemask(dst
, TGSI_WRITEMASK_Y
), brw_swizzle1(arg0
,0));
601 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
602 brw_MOV(p
, brw_writemask(tmp
, TGSI_WRITEMASK_Z
), brw_swizzle1(arg0
,1));
603 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
606 BRW_MATH_FUNCTION_POW
,
607 brw_writemask(dst
, TGSI_WRITEMASK_Z
),
608 brw_swizzle1(tmp
, 2),
609 brw_swizzle1(arg0
, 3),
610 BRW_MATH_PRECISION_PARTIAL
);
613 brw_ENDIF(p
, if_insn
);
620 /* TODO: relative addressing!
622 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
627 case TGSI_FILE_TEMPORARY
:
628 case TGSI_FILE_INPUT
:
629 case TGSI_FILE_OUTPUT
:
630 assert(c
->regs
[file
][index
].nr
!= 0);
631 return c
->regs
[file
][index
];
632 case TGSI_FILE_CONSTANT
:
633 assert(c
->regs
[TGSI_FILE_CONSTANT
][index
+ c
->prog_data
.num_imm
].nr
!= 0);
634 return c
->regs
[TGSI_FILE_CONSTANT
][index
+ c
->prog_data
.num_imm
];
635 case TGSI_FILE_IMMEDIATE
:
636 assert(c
->regs
[TGSI_FILE_CONSTANT
][index
].nr
!= 0);
637 return c
->regs
[TGSI_FILE_CONSTANT
][index
];
638 case TGSI_FILE_ADDRESS
:
640 return c
->regs
[file
][index
];
642 case TGSI_FILE_NULL
: /* undef values */
643 return brw_null_reg();
647 return brw_null_reg();
653 static struct brw_reg
deref( struct brw_vs_compile
*c
,
657 struct brw_compile
*p
= &c
->func
;
658 struct brw_reg tmp
= vec4(get_tmp(c
));
659 struct brw_reg vp_address
= retype(vec1(get_reg(c
, TGSI_FILE_ADDRESS
, 0)), BRW_REGISTER_TYPE_UW
);
660 unsigned byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
661 struct brw_reg indirect
= brw_vec4_indirect(0,0);
664 brw_push_insn_state(p
);
665 brw_set_access_mode(p
, BRW_ALIGN_1
);
667 /* This is pretty clunky - load the address register twice and
668 * fetch each 4-dword value in turn. There must be a way to do
669 * this in a single pass, but I couldn't get it to work.
671 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
672 brw_MOV(p
, tmp
, indirect
);
674 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
675 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
677 brw_pop_insn_state(p
);
684 static void emit_arl( struct brw_vs_compile
*c
,
686 struct brw_reg arg0
)
688 struct brw_compile
*p
= &c
->func
;
689 struct brw_reg tmp
= dst
;
690 boolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
695 brw_RNDD(p
, tmp
, arg0
);
696 brw_MUL(p
, dst
, tmp
, brw_imm_d(16));
703 /* Will return mangled results for SWZ op. The emit_swz() function
704 * ignores this result and recalculates taking extended swizzles into
707 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
708 struct tgsi_src_register
*src
)
712 if (src
->File
== TGSI_FILE_NULL
)
713 return brw_null_reg();
717 reg
= deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], src
->Index
);
720 reg
= get_reg(c
, src
->File
, src
->Index
);
722 /* Convert 3-bit swizzle to 2-bit.
724 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(src
->SwizzleX
,
729 /* Note this is ok for non-swizzle instructions:
731 reg
.negate
= src
->Negate
? 1 : 0;
737 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
738 const struct tgsi_dst_register
*dst
)
740 struct brw_reg reg
= get_reg(c
, dst
->File
, dst
->Index
);
742 reg
.dw1
.bits
.writemask
= dst
->WriteMask
;
750 static void emit_swz( struct brw_vs_compile
*c
,
752 struct tgsi_src_register src
)
754 struct brw_compile
*p
= &c
->func
;
755 unsigned zeros_mask
= 0;
756 unsigned ones_mask
= 0;
757 unsigned src_mask
= 0;
759 boolean need_tmp
= (src
.Negate
&&
760 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
761 struct brw_reg tmp
= dst
;
767 for (i
= 0; i
< 4; i
++) {
768 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
792 case TGSI_EXTSWIZZLE_ZERO
:
795 case TGSI_EXTSWIZZLE_ONE
:
802 /* Do src first, in case dst aliases src:
809 arg0
= deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], src
.Index
);
812 arg0
= get_reg(c
, src
.File
, src
.Index
);
814 arg0
= brw_swizzle(arg0
,
815 src_swz
[0], src_swz
[1],
816 src_swz
[2], src_swz
[3]);
818 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
822 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
825 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
828 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
831 brw_MOV(p
, dst
, tmp
);
838 /* Post-vertex-program processing. Send the results to the URB.
840 static void emit_vertex_write( struct brw_vs_compile
*c
, struct brw_prog_info
*info
)
842 struct brw_compile
*p
= &c
->func
;
843 struct brw_reg m0
= brw_message_reg(0);
844 struct brw_reg pos
= c
->regs
[TGSI_FILE_OUTPUT
][info
->pos_idx
];
847 if (c
->key
.copy_edgeflag
) {
849 get_reg(c
, TGSI_FILE_OUTPUT
, info
->result_edge_idx
),
850 get_reg(c
, TGSI_FILE_INPUT
, info
->edge_flag_idx
));
854 /* Build ndc coords? TODO: Shortcircuit when w is known to be one.
856 if (!c
->key
.know_w_is_one
) {
858 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
859 brw_MUL(p
, brw_writemask(ndc
, TGSI_WRITEMASK_XYZ
), pos
, ndc
);
865 /* This includes the workaround for -ve rhw, so is no longer an
868 if (info
->writes_psize
||
869 c
->key
.nr_userclip
||
870 !c
->key
.know_w_is_one
)
872 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
875 brw_MOV(p
, header1
, brw_imm_ud(0));
877 brw_set_access_mode(p
, BRW_ALIGN_16
);
879 if (info
->writes_psize
) {
880 struct brw_reg psiz
= c
->regs
[TGSI_FILE_OUTPUT
][info
->psize_idx
];
881 brw_MUL(p
, brw_writemask(header1
, TGSI_WRITEMASK_W
),
882 brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
883 brw_AND(p
, brw_writemask(header1
, TGSI_WRITEMASK_W
), header1
,
884 brw_imm_ud(0x7ff<<8));
888 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
889 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
890 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
891 brw_OR(p
, brw_writemask(header1
, TGSI_WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
892 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
896 /* i965 clipping workaround:
897 * 1) Test for -ve rhw
899 * set ndc = (0,0,0,0)
902 * Later, clipping will detect ucp[6] and ensure the primitive is
903 * clipped against all fixed planes.
905 if (!c
->key
.know_w_is_one
) {
907 vec8(brw_null_reg()),
909 brw_swizzle1(ndc
, 3),
912 brw_OR(p
, brw_writemask(header1
, TGSI_WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
913 brw_MOV(p
, ndc
, brw_imm_f(0));
914 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
917 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
918 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
919 brw_set_access_mode(p
, BRW_ALIGN_16
);
921 release_tmp(c
, header1
);
924 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
928 /* Emit the (interleaved) headers for the two vertices - an 8-reg
929 * of zeros followed by two sets of NDC coordinates:
931 brw_set_access_mode(p
, BRW_ALIGN_1
);
932 brw_MOV(p
, offset(m0
, 2), ndc
);
933 brw_MOV(p
, offset(m0
, 3), pos
);
937 brw_null_reg(), /* dest */
938 0, /* starting mrf reg nr */
942 c
->nr_outputs
+ 3, /* msg len */
943 0, /* response len */
945 1, /* writes complete */
946 0, /* urb destination offset */
947 BRW_URB_SWIZZLE_INTERLEAVE
);
952 post_vs_emit( struct brw_vs_compile
*c
, struct brw_instruction
*end_inst
)
954 struct tgsi_parse_context parse
;
955 const struct tgsi_token
*tokens
= c
->vp
->program
.tokens
;
956 tgsi_parse_init(&parse
, tokens
);
957 while (!tgsi_parse_end_of_tokens(&parse
)) {
958 tgsi_parse_token(&parse
);
959 if (parse
.FullToken
.Token
.Type
== TGSI_TOKEN_TYPE_INSTRUCTION
) {
961 struct brw_instruction
*brw_inst1
, *brw_inst2
;
962 const struct tgsi_full_instruction
*inst1
, *inst2
;
964 inst1
= &parse
.FullToken
.FullInstruction
;
965 brw_inst1
= inst1
->Data
;
966 switch (inst1
->Opcode
) {
967 case TGSI_OPCODE_CAL
:
968 case TGSI_OPCODE_BRA
:
969 target_insn
= inst1
->BranchTarget
;
970 inst2
= &c
->vp
->program
.Base
.Instructions
[target_insn
];
971 brw_inst2
= inst2
->Data
;
972 offset
= brw_inst2
- brw_inst1
;
973 brw_set_src1(brw_inst1
, brw_imm_d(offset
*16));
975 case TGSI_OPCODE_END
:
976 offset
= end_inst
- brw_inst1
;
977 brw_set_src1(brw_inst1
, brw_imm_d(offset
*16));
985 tgsi_parse_free(&parse
);
988 static void process_declaration(const struct tgsi_full_declaration
*decl
,
989 struct brw_prog_info
*info
)
991 int first
= decl
->u
.DeclarationRange
.First
;
992 int last
= decl
->u
.DeclarationRange
.Last
;
994 assert (decl
->Declaration
.Declare
!= TGSI_DECLARE_MASK
);
996 switch(decl
->Declaration
.File
) {
997 case TGSI_FILE_CONSTANT
:
998 info
->num_consts
+= last
- first
+ 1;
1000 case TGSI_FILE_INPUT
: {
1003 case TGSI_FILE_OUTPUT
: {
1004 assert(last
== first
); /* for now */
1005 if (decl
->Declaration
.Semantic
) {
1006 switch (decl
->Semantic
.SemanticName
) {
1007 case TGSI_SEMANTIC_POSITION
: {
1008 info
->pos_idx
= first
;
1011 case TGSI_SEMANTIC_COLOR
:
1013 case TGSI_SEMANTIC_BCOLOR
:
1015 case TGSI_SEMANTIC_FOG
:
1017 case TGSI_SEMANTIC_PSIZE
: {
1018 info
->writes_psize
= TRUE
;
1019 info
->psize_idx
= first
;
1022 case TGSI_SEMANTIC_GENERIC
:
1028 case TGSI_FILE_TEMPORARY
: {
1029 info
->num_temps
+= (last
- first
) + 1;
1032 case TGSI_FILE_SAMPLER
: {
1035 case TGSI_FILE_ADDRESS
: {
1036 info
->num_addrs
+= (last
- first
) + 1;
1039 case TGSI_FILE_IMMEDIATE
: {
1042 case TGSI_FILE_NULL
: {
1048 static void process_instruction(struct brw_vs_compile
*c
,
1049 struct tgsi_full_instruction
*inst
,
1050 struct brw_prog_info
*info
)
1052 struct brw_reg args
[3], dst
;
1053 struct brw_compile
*p
= &c
->func
;
1054 /*struct brw_indirect stack_index = brw_indirect(0, 0);*/
1058 /*FIXME: might not be the only one*/
1059 const struct tgsi_dst_register
*dst_reg
= &inst
->FullDstRegisters
[0].DstRegister
;
1061 struct brw_instruction *if_inst[MAX_IFSN];
1062 unsigned insn, if_insn = 0;
1065 for (i
= 0; i
< 3; i
++) {
1066 struct tgsi_full_src_register
*src
= &inst
->FullSrcRegisters
[i
];
1067 index
= src
->SrcRegister
.Index
;
1068 file
= src
->SrcRegister
.File
;
1069 if (file
== TGSI_FILE_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1070 args
[i
] = c
->output_regs
[index
].reg
;
1072 args
[i
] = get_arg(c
, &src
->SrcRegister
);
1075 /* Get dest regs. Note that it is possible for a reg to be both
1076 * dst and arg, given the static allocation of registers. So
1077 * care needs to be taken emitting multi-operation instructions.
1079 index
= dst_reg
->Index
;
1080 file
= dst_reg
->File
;
1081 if (file
== TGSI_FILE_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1082 dst
= c
->output_regs
[index
].reg
;
1084 dst
= get_dst(c
, dst_reg
);
1086 switch (inst
->Instruction
.Opcode
) {
1087 case TGSI_OPCODE_ABS
:
1088 brw_MOV(p
, dst
, brw_abs(args
[0]));
1090 case TGSI_OPCODE_ADD
:
1091 brw_ADD(p
, dst
, args
[0], args
[1]);
1093 case TGSI_OPCODE_DP3
:
1094 brw_DP3(p
, dst
, args
[0], args
[1]);
1096 case TGSI_OPCODE_DP4
:
1097 brw_DP4(p
, dst
, args
[0], args
[1]);
1099 case TGSI_OPCODE_DPH
:
1100 brw_DPH(p
, dst
, args
[0], args
[1]);
1102 case TGSI_OPCODE_DST
:
1103 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1105 case TGSI_OPCODE_EXP
:
1106 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1108 case TGSI_OPCODE_EX2
:
1109 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1111 case TGSI_OPCODE_ARL
:
1112 emit_arl(c
, dst
, args
[0]);
1114 case TGSI_OPCODE_FLR
:
1115 brw_RNDD(p
, dst
, args
[0]);
1117 case TGSI_OPCODE_FRC
:
1118 brw_FRC(p
, dst
, args
[0]);
1120 case TGSI_OPCODE_LOG
:
1121 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1123 case TGSI_OPCODE_LG2
:
1124 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1126 case TGSI_OPCODE_LIT
:
1127 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1129 case TGSI_OPCODE_MAD
:
1130 brw_MOV(p
, brw_acc_reg(), args
[2]);
1131 brw_MAC(p
, dst
, args
[0], args
[1]);
1133 case TGSI_OPCODE_MAX
:
1134 emit_max(p
, dst
, args
[0], args
[1]);
1136 case TGSI_OPCODE_MIN
:
1137 emit_min(p
, dst
, args
[0], args
[1]);
1139 case TGSI_OPCODE_MOV
:
1141 case TGSI_OPCODE_SWZ
:
1142 /* The args[0] value can't be used here as it won't have
1143 * correctly encoded the full swizzle:
1145 emit_swz(c
, dst
, inst
->SrcReg
[0] );
1147 brw_MOV(p
, dst
, args
[0]);
1149 case TGSI_OPCODE_MUL
:
1150 brw_MUL(p
, dst
, args
[0], args
[1]);
1152 case TGSI_OPCODE_POW
:
1153 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1155 case TGSI_OPCODE_RCP
:
1156 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1158 case TGSI_OPCODE_RSQ
:
1159 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1162 case TGSI_OPCODE_SEQ
:
1163 emit_seq(p
, dst
, args
[0], args
[1]);
1165 case TGSI_OPCODE_SNE
:
1166 emit_sne(p
, dst
, args
[0], args
[1]);
1168 case TGSI_OPCODE_SGE
:
1169 emit_sge(p
, dst
, args
[0], args
[1]);
1171 case TGSI_OPCODE_SGT
:
1172 emit_sgt(p
, dst
, args
[0], args
[1]);
1174 case TGSI_OPCODE_SLT
:
1175 emit_slt(p
, dst
, args
[0], args
[1]);
1177 case TGSI_OPCODE_SLE
:
1178 emit_sle(p
, dst
, args
[0], args
[1]);
1180 case TGSI_OPCODE_SUB
:
1181 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1183 case TGSI_OPCODE_XPD
:
1184 emit_xpd(p
, dst
, args
[0], args
[1]);
1187 case TGSI_OPCODE_IF
:
1188 assert(if_insn
< MAX_IFSN
);
1189 if_inst
[if_insn
++] = brw_IF(p
, BRW_EXECUTE_8
);
1191 case TGSI_OPCODE_ELSE
:
1192 if_inst
[if_insn
-1] = brw_ELSE(p
, if_inst
[if_insn
-1]);
1194 case TGSI_OPCODE_ENDIF
:
1195 assert(if_insn
> 0);
1196 brw_ENDIF(p
, if_inst
[--if_insn
]);
1198 case TGSI_OPCODE_BRA
:
1199 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
1200 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1201 brw_set_predicate_control_flag_value(p
, 0xff);
1203 case TGSI_OPCODE_CAL
:
1204 brw_set_access_mode(p
, BRW_ALIGN_1
);
1205 brw_ADD(p
, deref_1uw(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1206 brw_set_access_mode(p
, BRW_ALIGN_16
);
1207 brw_ADD(p
, get_addr_reg(stack_index
),
1208 get_addr_reg(stack_index
), brw_imm_d(4));
1209 inst
->Data
= &p
->store
[p
->nr_insn
];
1210 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1213 case TGSI_OPCODE_RET
:
1215 brw_ADD(p
, get_addr_reg(stack_index
),
1216 get_addr_reg(stack_index
), brw_imm_d(-4));
1217 brw_set_access_mode(p
, BRW_ALIGN_1
);
1218 brw_MOV(p
, brw_ip_reg(), deref_1uw(stack_index
, 0));
1219 brw_set_access_mode(p
, BRW_ALIGN_16
);
1221 /*brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));*/
1224 case TGSI_OPCODE_END
:
1225 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1227 case TGSI_OPCODE_BGNSUB
:
1228 case TGSI_OPCODE_ENDSUB
:
1231 debug_printf("Unsupport opcode %d in vertex shader\n", inst
->Instruction
.Opcode
);
1235 if (dst_reg
->File
== TGSI_FILE_OUTPUT
1236 && dst_reg
->Index
!= info
->pos_idx
1237 && c
->output_regs
[dst_reg
->Index
].used_in_src
)
1238 brw_MOV(p
, get_dst(c
, dst_reg
), dst
);
1243 /* Emit the vertex program instructions here.
1245 void brw_vs_emit(struct brw_vs_compile
*c
)
1248 struct brw_compile
*p
= &c
->func
;
1249 struct brw_instruction
*end_inst
;
1250 struct tgsi_parse_context parse
;
1251 struct brw_indirect stack_index
= brw_indirect(0, 0);
1252 const struct tgsi_token
*tokens
= c
->vp
->program
.tokens
;
1253 struct brw_prog_info prog_info
;
1254 unsigned allocated_registers
= 0;
1255 memset(&prog_info
, 0, sizeof(struct brw_prog_info
));
1257 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1258 brw_set_access_mode(p
, BRW_ALIGN_16
);
1260 tgsi_parse_init(&parse
, tokens
);
1261 /* Message registers can't be read, so copy outputs into GRF registers
1262 if they are used as source registers */
1263 while (!tgsi_parse_end_of_tokens(&parse
)) {
1264 tgsi_parse_token(&parse
);
1266 switch (parse
.FullToken
.Token
.Type
) {
1267 case TGSI_TOKEN_TYPE_INSTRUCTION
: {
1268 const struct tgsi_full_instruction
*inst
= &parse
.FullToken
.FullInstruction
;
1269 for (i
= 0; i
< 3; ++i
) {
1270 const struct tgsi_src_register
*src
= &inst
->FullSrcRegisters
[i
].SrcRegister
;
1271 unsigned index
= src
->Index
;
1272 unsigned file
= src
->File
;
1273 if (file
== TGSI_FILE_OUTPUT
)
1274 c
->output_regs
[index
].used_in_src
= TRUE
;
1283 tgsi_parse_free(&parse
);
1285 tgsi_parse_init(&parse
, tokens
);
1287 while (!tgsi_parse_end_of_tokens(&parse
)) {
1288 tgsi_parse_token(&parse
);
1290 switch (parse
.FullToken
.Token
.Type
) {
1291 case TGSI_TOKEN_TYPE_DECLARATION
: {
1292 struct tgsi_full_declaration
*decl
= &parse
.FullToken
.FullDeclaration
;
1293 process_declaration(decl
, &prog_info
);
1296 case TGSI_TOKEN_TYPE_IMMEDIATE
: {
1297 struct tgsi_full_immediate
*imm
= &parse
.FullToken
.FullImmediate
;
1298 /*assert(imm->Immediate.Size == 4);*/
1299 c
->prog_data
.imm_buf
[c
->prog_data
.num_imm
][0] = imm
->u
.ImmediateFloat32
[0].Float
;
1300 c
->prog_data
.imm_buf
[c
->prog_data
.num_imm
][1] = imm
->u
.ImmediateFloat32
[1].Float
;
1301 c
->prog_data
.imm_buf
[c
->prog_data
.num_imm
][2] = imm
->u
.ImmediateFloat32
[2].Float
;
1302 c
->prog_data
.imm_buf
[c
->prog_data
.num_imm
][3] = imm
->u
.ImmediateFloat32
[3].Float
;
1303 c
->prog_data
.num_imm
++;
1306 case TGSI_TOKEN_TYPE_INSTRUCTION
: {
1307 struct tgsi_full_instruction
*inst
= &parse
.FullToken
.FullInstruction
;
1308 if (!allocated_registers
) {
1309 /* first instruction (declarations finished).
1310 * now that we know what vars are being used allocate
1311 * registers for them.*/
1312 c
->prog_data
.num_consts
= prog_info
.num_consts
;
1313 c
->prog_data
.max_const
= prog_info
.num_consts
+ c
->prog_data
.num_imm
;
1314 brw_vs_alloc_regs(c
, &prog_info
);
1316 brw_set_access_mode(p
, BRW_ALIGN_1
);
1317 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1318 brw_set_access_mode(p
, BRW_ALIGN_16
);
1319 allocated_registers
= 1;
1321 process_instruction(c
, inst
, &prog_info
);
1327 end_inst
= &p
->store
[p
->nr_insn
];
1328 emit_vertex_write(c
, &prog_info
);
1329 post_vs_emit(c
, end_inst
);
1330 tgsi_parse_free(&parse
);