2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
42 /* Do things as simply as possible. Allocate and populate all regs
45 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
47 GLuint i
, reg
= 0, mrf
;
50 /* r0 -- reserved as usual
52 c
->r0
= brw_vec8_grf(reg
, 0); reg
++;
54 /* User clip planes from curbe:
56 if (c
->key
.nr_userclip
) {
57 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
58 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
61 /* Deal with curbe alignment:
63 reg
+= ((6+c
->key
.nr_userclip
+3)/4)*2;
66 /* Vertex program parameters from curbe:
68 nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
69 for (i
= 0; i
< nr_params
; i
++) {
70 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
72 reg
+= (nr_params
+1)/2;
74 c
->prog_data
.curb_read_length
= reg
- 1;
78 /* Allocate input regs:
81 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
82 if (c
->prog_data
.inputs_read
& (1<<i
)) {
84 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
90 /* Allocate outputs: TODO: could organize the non-position outputs
91 * to go straight into message regs.
94 c
->first_output
= reg
;
96 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
97 if (c
->prog_data
.outputs_written
& (1<<i
)) {
99 if (i
== VERT_RESULT_HPOS
) {
100 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
103 else if (i
== VERT_RESULT_PSIZ
) {
104 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
106 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
109 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
115 /* Allocate program temporaries:
117 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
118 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
122 /* Address reg(s). Don't try to use the internal address reg until
125 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
126 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
130 BRW_VERTICAL_STRIDE_8
,
132 BRW_HORIZONTAL_STRIDE_1
,
139 /* Some opcodes need an internal temporary:
142 c
->last_tmp
= reg
; /* for allocation purposes */
144 /* Each input reg holds data from two vertices. The
145 * urb_read_length is the number of registers read from *each*
146 * vertex urb, so is half the amount:
148 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+1)/2;
150 c
->prog_data
.urb_entry_size
= (c
->nr_outputs
+2+3)/4;
151 c
->prog_data
.total_grf
= reg
;
155 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
157 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
159 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
160 c
->prog_data
.total_grf
= c
->last_tmp
;
165 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
167 if (tmp
.nr
== c
->last_tmp
-1)
171 static void release_tmps( struct brw_vs_compile
*c
)
173 c
->last_tmp
= c
->first_tmp
;
177 static void unalias1( struct brw_vs_compile
*c
,
180 void (*func
)( struct brw_vs_compile
*,
184 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
185 struct brw_compile
*p
= &c
->func
;
186 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
188 brw_MOV(p
, dst
, tmp
);
195 static void unalias2( struct brw_vs_compile
*c
,
199 void (*func
)( struct brw_vs_compile
*,
204 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) &&
205 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
206 struct brw_compile
*p
= &c
->func
;
207 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
208 func(c
, tmp
, arg0
, arg1
);
209 brw_MOV(p
, dst
, tmp
);
212 func(c
, dst
, arg0
, arg1
);
219 static void emit_slt( struct brw_compile
*p
,
222 struct brw_reg arg1
)
224 /* Could be done with an if/else/endif, but this method uses half
225 * the instructions. Note that we are careful to reference the
226 * arguments before writing the dest. That means we emit the
227 * instructions in an odd order and have to play with the flag
230 brw_push_insn_state(p
);
231 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
233 /* Write all values to 1:
235 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
236 brw_MOV(p
, dst
, brw_imm_f(1.0));
238 /* Where the test succeeded, overwite with zero:
240 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
241 brw_MOV(p
, dst
, brw_imm_f(0.0));
242 brw_pop_insn_state(p
);
246 static void emit_sge( struct brw_compile
*p
,
249 struct brw_reg arg1
)
251 brw_push_insn_state(p
);
252 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_GE
, arg0
, arg1
);
254 /* Write all values to zero:
256 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
257 brw_MOV(p
, dst
, brw_imm_f(0));
259 /* Where the test succeeded, overwite with 1:
261 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
262 brw_MOV(p
, dst
, brw_imm_f(1.0));
263 brw_pop_insn_state(p
);
267 static void emit_max( struct brw_compile
*p
,
270 struct brw_reg arg1
)
272 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
273 brw_SEL(p
, dst
, arg1
, arg0
);
274 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
277 static void emit_min( struct brw_compile
*p
,
280 struct brw_reg arg1
)
282 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
283 brw_SEL(p
, dst
, arg0
, arg1
);
284 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
288 static void emit_math1( struct brw_vs_compile
*c
,
294 /* There are various odd behaviours with SEND on the simulator. In
295 * addition there are documented issues with the fact that the GEN4
296 * processor doesn't do dependency control properly on SEND
297 * results. So, on balance, this kludge to get around failures
298 * with writemasked math results looks like it might be necessary
299 * whether that turns out to be a simulator bug or not:
301 struct brw_compile
*p
= &c
->func
;
302 struct brw_reg tmp
= dst
;
303 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
304 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
312 BRW_MATH_SATURATE_NONE
,
315 BRW_MATH_DATA_SCALAR
,
319 brw_MOV(p
, dst
, tmp
);
324 static void emit_math2( struct brw_vs_compile
*c
,
331 struct brw_compile
*p
= &c
->func
;
332 struct brw_reg tmp
= dst
;
333 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
334 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
339 brw_MOV(p
, brw_message_reg(3), arg1
);
344 BRW_MATH_SATURATE_NONE
,
347 BRW_MATH_DATA_SCALAR
,
351 brw_MOV(p
, dst
, tmp
);
358 static void emit_exp_noalias( struct brw_vs_compile
*c
,
360 struct brw_reg arg0
)
362 struct brw_compile
*p
= &c
->func
;
365 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
366 struct brw_reg tmp
= get_tmp(c
);
367 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
369 /* tmp_d = floor(arg0.x) */
370 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
372 /* result[0] = 2.0 ^ tmp */
374 /* Adjust exponent for floating point:
377 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
379 /* Install exponent and sign.
380 * Excess drops off the edge:
382 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
383 tmp_d
, brw_imm_d(23));
388 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
389 /* result[1] = arg0.x - floor(arg0.x) */
390 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
393 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
394 /* As with the LOG instruction, we might be better off just
395 * doing a taylor expansion here, seeing as we have to do all
398 * If mathbox partial precision is too low, consider also:
399 * result[3] = result[0] * EXP(result[1])
402 BRW_MATH_FUNCTION_EXP
,
403 brw_writemask(dst
, WRITEMASK_Z
),
404 brw_swizzle1(arg0
, 0),
405 BRW_MATH_PRECISION_PARTIAL
);
408 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
409 /* result[3] = 1.0; */
410 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
415 static void emit_log_noalias( struct brw_vs_compile
*c
,
417 struct brw_reg arg0
)
419 struct brw_compile
*p
= &c
->func
;
420 struct brw_reg tmp
= dst
;
421 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
422 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
423 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
424 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
428 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
431 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
434 * These almost look like they could be joined up, but not really
437 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
438 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
440 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
442 brw_writemask(tmp_ud
, WRITEMASK_X
),
443 brw_swizzle1(arg0_ud
, 0),
444 brw_imm_ud((1U<<31)-1));
447 brw_writemask(tmp_ud
, WRITEMASK_X
),
452 brw_writemask(tmp
, WRITEMASK_X
),
453 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
457 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
459 brw_writemask(tmp_ud
, WRITEMASK_Y
),
460 brw_swizzle1(arg0_ud
, 0),
461 brw_imm_ud((1<<23)-1));
464 brw_writemask(tmp_ud
, WRITEMASK_Y
),
466 brw_imm_ud(127<<23));
469 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
470 /* result[2] = result[0] + LOG2(result[1]); */
472 /* Why bother? The above is just a hint how to do this with a
473 * taylor series. Maybe we *should* use a taylor series as by
474 * the time all the above has been done it's almost certainly
475 * quicker than calling the mathbox, even with low precision.
478 * - result[0] + mathbox.LOG2(result[1])
479 * - mathbox.LOG2(arg0.x)
480 * - result[0] + inline_taylor_approx(result[1])
483 BRW_MATH_FUNCTION_LOG
,
484 brw_writemask(tmp
, WRITEMASK_Z
),
485 brw_swizzle1(tmp
, 1),
486 BRW_MATH_PRECISION_FULL
);
489 brw_writemask(tmp
, WRITEMASK_Z
),
490 brw_swizzle1(tmp
, 2),
491 brw_swizzle1(tmp
, 0));
494 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
495 /* result[3] = 1.0; */
496 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
500 brw_MOV(p
, dst
, tmp
);
508 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
510 static void emit_dst_noalias( struct brw_vs_compile
*c
,
515 struct brw_compile
*p
= &c
->func
;
517 /* There must be a better way to do this:
519 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
520 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
521 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
522 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
523 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
524 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
525 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
526 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
529 static void emit_xpd( struct brw_compile
*p
,
534 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
535 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
540 static void emit_lit_noalias( struct brw_vs_compile
*c
,
542 struct brw_reg arg0
)
544 struct brw_compile
*p
= &c
->func
;
545 struct brw_instruction
*if_insn
;
546 struct brw_reg tmp
= dst
;
547 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
552 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
553 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
555 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
556 * to get all channels active inside the IF. In the clipping code
557 * we run with NoMask, so it's not an option and we can use
558 * BRW_EXECUTE_1 for all comparisons.
560 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
561 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
563 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
565 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
566 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
567 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
570 BRW_MATH_FUNCTION_POW
,
571 brw_writemask(dst
, WRITEMASK_Z
),
572 brw_swizzle1(tmp
, 2),
573 brw_swizzle1(arg0
, 3),
574 BRW_MATH_PRECISION_PARTIAL
);
577 brw_ENDIF(p
, if_insn
);
584 /* TODO: relative addressing!
586 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
592 case PROGRAM_TEMPORARY
:
595 case PROGRAM_STATE_VAR
:
596 assert(c
->regs
[file
][index
].nr
!= 0);
597 return c
->regs
[file
][index
];
598 case PROGRAM_ADDRESS
:
600 return c
->regs
[file
][index
];
602 case PROGRAM_UNDEFINED
: /* undef values */
603 return brw_null_reg();
605 case PROGRAM_LOCAL_PARAM
:
606 case PROGRAM_ENV_PARAM
:
607 case PROGRAM_WRITE_ONLY
:
610 return brw_null_reg();
616 static struct brw_reg
deref( struct brw_vs_compile
*c
,
620 struct brw_compile
*p
= &c
->func
;
621 struct brw_reg tmp
= vec4(get_tmp(c
));
622 struct brw_reg vp_address
= retype(vec1(get_reg(c
, PROGRAM_ADDRESS
, 0)), BRW_REGISTER_TYPE_UW
);
623 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
624 struct brw_reg indirect
= brw_vec4_indirect(0,0);
627 brw_push_insn_state(p
);
628 brw_set_access_mode(p
, BRW_ALIGN_1
);
630 /* This is pretty clunky - load the address register twice and
631 * fetch each 4-dword value in turn. There must be a way to do
632 * this in a single pass, but I couldn't get it to work.
634 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
635 brw_MOV(p
, tmp
, indirect
);
637 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
638 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
640 brw_pop_insn_state(p
);
647 static void emit_arl( struct brw_vs_compile
*c
,
649 struct brw_reg arg0
)
651 struct brw_compile
*p
= &c
->func
;
652 struct brw_reg tmp
= dst
;
653 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
658 brw_RNDD(p
, tmp
, arg0
);
659 brw_MUL(p
, dst
, tmp
, brw_imm_d(16));
666 /* Will return mangled results for SWZ op. The emit_swz() function
667 * ignores this result and recalculates taking extended swizzles into
670 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
671 struct prog_src_register src
)
675 if (src
.File
== PROGRAM_UNDEFINED
)
676 return brw_null_reg();
679 reg
= deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], src
.Index
);
681 reg
= get_reg(c
, src
.File
, src
.Index
);
683 /* Convert 3-bit swizzle to 2-bit.
685 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
.Swizzle
, 0),
686 GET_SWZ(src
.Swizzle
, 1),
687 GET_SWZ(src
.Swizzle
, 2),
688 GET_SWZ(src
.Swizzle
, 3));
690 /* Note this is ok for non-swizzle instructions:
692 reg
.negate
= src
.NegateBase
? 1 : 0;
698 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
699 struct prog_dst_register dst
)
701 struct brw_reg reg
= get_reg(c
, dst
.File
, dst
.Index
);
703 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
711 static void emit_swz( struct brw_vs_compile
*c
,
713 struct prog_src_register src
)
715 struct brw_compile
*p
= &c
->func
;
716 GLuint zeros_mask
= 0;
717 GLuint ones_mask
= 0;
720 GLboolean need_tmp
= (src
.NegateBase
&&
721 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
722 struct brw_reg tmp
= dst
;
728 for (i
= 0; i
< 4; i
++) {
729 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
730 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
749 /* Do src first, in case dst aliases src:
755 arg0
= deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], src
.Index
);
757 arg0
= get_reg(c
, src
.File
, src
.Index
);
759 arg0
= brw_swizzle(arg0
,
760 src_swz
[0], src_swz
[1],
761 src_swz
[2], src_swz
[3]);
763 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
767 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
770 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
773 brw_MOV(p
, brw_writemask(tmp
, src
.NegateBase
), negate(tmp
));
776 brw_MOV(p
, dst
, tmp
);
783 /* Post-vertex-program processing. Send the results to the URB.
785 static void emit_vertex_write( struct brw_vs_compile
*c
)
787 struct brw_compile
*p
= &c
->func
;
788 struct brw_reg m0
= brw_message_reg(0);
789 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
792 if (c
->key
.copy_edgeflag
) {
794 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
795 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
799 /* Build ndc coords? TODO: Shortcircuit when w is known to be one.
801 if (!c
->key
.know_w_is_one
) {
803 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
804 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
810 /* This includes the workaround for -ve rhw, so is no longer an
813 if ((c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) ||
814 c
->key
.nr_userclip
||
815 !c
->key
.know_w_is_one
)
817 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
820 brw_MOV(p
, header1
, brw_imm_ud(0));
822 brw_set_access_mode(p
, BRW_ALIGN_16
);
824 if (c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) {
825 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
826 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
827 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
831 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
832 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
833 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
834 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
835 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
839 /* i965 clipping workaround:
840 * 1) Test for -ve rhw
842 * set ndc = (0,0,0,0)
845 * Later, clipping will detect ucp[6] and ensure the primitive is
846 * clipped against all fixed planes.
848 if (!c
->key
.know_w_is_one
) {
850 vec8(brw_null_reg()),
852 brw_swizzle1(ndc
, 3),
855 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
856 brw_MOV(p
, ndc
, brw_imm_f(0));
857 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
860 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
861 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
862 brw_set_access_mode(p
, BRW_ALIGN_16
);
864 release_tmp(c
, header1
);
867 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
871 /* Emit the (interleaved) headers for the two vertices - an 8-reg
872 * of zeros followed by two sets of NDC coordinates:
874 brw_set_access_mode(p
, BRW_ALIGN_1
);
875 brw_MOV(p
, offset(m0
, 2), ndc
);
876 brw_MOV(p
, offset(m0
, 3), pos
);
880 brw_null_reg(), /* dest */
881 0, /* starting mrf reg nr */
885 c
->nr_outputs
+ 3, /* msg len */
886 0, /* response len */
888 1, /* writes complete */
889 0, /* urb destination offset */
890 BRW_URB_SWIZZLE_INTERLEAVE
);
897 /* Emit the vertex program instructions here.
899 void brw_vs_emit( struct brw_vs_compile
*c
)
901 struct brw_compile
*p
= &c
->func
;
902 GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
906 if (INTEL_DEBUG
& DEBUG_VS
) {
907 _mesa_printf("\n\n\nvs-emit:\n");
908 _mesa_print_program(&c
->vp
->program
.Base
);
912 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
913 brw_set_access_mode(p
, BRW_ALIGN_16
);
915 /* Static register allocation
917 brw_vs_alloc_regs(c
);
919 for (insn
= 0; insn
< nr_insns
; insn
++) {
921 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
922 struct brw_reg args
[3], dst
;
925 /* Get argument regs. SWZ is special and does this itself.
927 if (inst
->Opcode
!= OPCODE_SWZ
)
928 for (i
= 0; i
< 3; i
++)
929 args
[i
] = get_arg(c
, inst
->SrcReg
[i
]);
931 /* Get dest regs. Note that it is possible for a reg to be both
932 * dst and arg, given the static allocation of registers. So
933 * care needs to be taken emitting multi-operation instructions.
935 dst
= get_dst(c
, inst
->DstReg
);
938 switch (inst
->Opcode
) {
940 brw_MOV(p
, dst
, brw_abs(args
[0]));
943 brw_ADD(p
, dst
, args
[0], args
[1]);
946 brw_DP3(p
, dst
, args
[0], args
[1]);
949 brw_DP4(p
, dst
, args
[0], args
[1]);
952 brw_DPH(p
, dst
, args
[0], args
[1]);
955 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
958 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
961 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
964 emit_arl(c
, dst
, args
[0]);
967 brw_RNDD(p
, dst
, args
[0]);
970 brw_FRC(p
, dst
, args
[0]);
973 unalias1(c
, dst
, args
[0], emit_log_noalias
);
976 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
979 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
982 brw_MOV(p
, brw_acc_reg(), args
[2]);
983 brw_MAC(p
, dst
, args
[0], args
[1]);
986 emit_max(p
, dst
, args
[0], args
[1]);
989 emit_min(p
, dst
, args
[0], args
[1]);
992 brw_MOV(p
, dst
, args
[0]);
995 brw_MUL(p
, dst
, args
[0], args
[1]);
998 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1001 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1004 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1007 emit_sge(p
, dst
, args
[0], args
[1]);
1010 emit_slt(p
, dst
, args
[0], args
[1]);
1013 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1016 /* The args[0] value can't be used here as it won't have
1017 * correctly encoded the full swizzle:
1019 emit_swz(c
, dst
, inst
->SrcReg
[0] );
1022 emit_xpd(p
, dst
, args
[0], args
[1]);
1034 emit_vertex_write(c
);