2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **********************************************************************/
29 * Keith Whitwell <keith@tungstengraphics.com>
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
41 static struct brw_reg
get_tmp( struct brw_vs_compile
*c
)
43 struct brw_reg tmp
= brw_vec8_grf(c
->last_tmp
, 0);
45 if (++c
->last_tmp
> c
->prog_data
.total_grf
)
46 c
->prog_data
.total_grf
= c
->last_tmp
;
51 static void release_tmp( struct brw_vs_compile
*c
, struct brw_reg tmp
)
53 if (tmp
.nr
== c
->last_tmp
-1)
57 static void release_tmps( struct brw_vs_compile
*c
)
59 c
->last_tmp
= c
->first_tmp
;
64 * Preallocate GRF registers before code emit.
65 * Do things as simply as possible. Allocate and populate all regs
68 static void brw_vs_alloc_regs( struct brw_vs_compile
*c
)
70 GLuint i
, reg
= 0, mrf
;
72 /* Determine whether to use a real constant buffer or use a block
73 * of GRF registers for constants. The latter is faster but only
74 * works if everything fits in the GRF.
75 * XXX this heuristic/check may need some fine tuning...
77 if (c
->vp
->program
.Base
.Parameters
->NumParameters
+
78 c
->vp
->program
.Base
.NumTemporaries
+ 20 > BRW_MAX_GRF
)
79 c
->vp
->use_const_buffer
= GL_TRUE
;
81 c
->vp
->use_const_buffer
= GL_FALSE
;
83 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
85 /* r0 -- reserved as usual
87 c
->r0
= brw_vec8_grf(reg
, 0);
90 /* User clip planes from curbe:
92 if (c
->key
.nr_userclip
) {
93 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
94 c
->userplane
[i
] = stride( brw_vec4_grf(reg
+3+i
/2, (i
%2) * 4), 0, 4, 1);
97 /* Deal with curbe alignment:
99 reg
+= ((6 + c
->key
.nr_userclip
+ 3) / 4) * 2;
102 /* Vertex program parameters from curbe:
104 if (c
->vp
->use_const_buffer
) {
105 /* get constants from a real constant buffer */
106 c
->prog_data
.curb_read_length
= 0;
107 c
->prog_data
.nr_params
= 4; /* XXX 0 causes a bug elsewhere... */
110 /* use a section of the GRF for constants */
111 GLuint nr_params
= c
->vp
->program
.Base
.Parameters
->NumParameters
;
112 for (i
= 0; i
< nr_params
; i
++) {
113 c
->regs
[PROGRAM_STATE_VAR
][i
] = stride( brw_vec4_grf(reg
+i
/2, (i
%2) * 4), 0, 4, 1);
115 reg
+= (nr_params
+ 1) / 2;
116 c
->prog_data
.curb_read_length
= reg
- 1;
118 c
->prog_data
.nr_params
= nr_params
* 4;
121 /* Allocate input regs:
124 for (i
= 0; i
< VERT_ATTRIB_MAX
; i
++) {
125 if (c
->prog_data
.inputs_read
& (1 << i
)) {
127 c
->regs
[PROGRAM_INPUT
][i
] = brw_vec8_grf(reg
, 0);
132 /* Allocate outputs. The non-position outputs go straight into message regs.
135 c
->first_output
= reg
;
136 c
->first_overflow_output
= 0;
138 for (i
= 0; i
< VERT_RESULT_MAX
; i
++) {
139 if (c
->prog_data
.outputs_written
& (1 << i
)) {
141 assert(i
< Elements(c
->regs
[PROGRAM_OUTPUT
]));
142 if (i
== VERT_RESULT_HPOS
) {
143 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
146 else if (i
== VERT_RESULT_PSIZ
) {
147 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
149 mrf
++; /* just a placeholder? XXX fix later stages & remove this */
153 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_message_reg(mrf
);
157 /* too many vertex results to fit in MRF, use GRF for overflow */
158 if (!c
->first_overflow_output
)
159 c
->first_overflow_output
= i
;
160 c
->regs
[PROGRAM_OUTPUT
][i
] = brw_vec8_grf(reg
, 0);
167 /* Allocate program temporaries:
169 for (i
= 0; i
< c
->vp
->program
.Base
.NumTemporaries
; i
++) {
170 c
->regs
[PROGRAM_TEMPORARY
][i
] = brw_vec8_grf(reg
, 0);
174 /* Address reg(s). Don't try to use the internal address reg until
177 for (i
= 0; i
< c
->vp
->program
.Base
.NumAddressRegs
; i
++) {
178 c
->regs
[PROGRAM_ADDRESS
][i
] = brw_reg(BRW_GENERAL_REGISTER_FILE
,
182 BRW_VERTICAL_STRIDE_8
,
184 BRW_HORIZONTAL_STRIDE_1
,
190 if (c
->vp
->use_const_buffer
) {
191 for (i
= 0; i
< 3; i
++) {
192 c
->current_const
[i
].index
= -1;
193 c
->current_const
[i
].reg
= brw_vec8_grf(reg
, 0);
198 for (i
= 0; i
< 128; i
++) {
199 if (c
->output_regs
[i
].used_in_src
) {
200 c
->output_regs
[i
].reg
= brw_vec8_grf(reg
, 0);
205 c
->stack
= brw_uw16_reg(BRW_GENERAL_REGISTER_FILE
, reg
, 0);
208 /* Some opcodes need an internal temporary:
211 c
->last_tmp
= reg
; /* for allocation purposes */
213 /* Each input reg holds data from two vertices. The
214 * urb_read_length is the number of registers read from *each*
215 * vertex urb, so is half the amount:
217 c
->prog_data
.urb_read_length
= (c
->nr_inputs
+ 1) / 2;
219 c
->prog_data
.urb_entry_size
= (c
->nr_outputs
+ 2 + 3) / 4;
220 c
->prog_data
.total_grf
= reg
;
222 if (INTEL_DEBUG
& DEBUG_VS
) {
223 _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumAddressRegs
);
224 _mesa_printf("%s NumTemps %d\n", __FUNCTION__
, c
->vp
->program
.Base
.NumTemporaries
);
225 _mesa_printf("%s reg = %d\n", __FUNCTION__
, reg
);
231 * If an instruction uses a temp reg both as a src and the dest, we
232 * sometimes need to allocate an intermediate temporary.
234 static void unalias1( struct brw_vs_compile
*c
,
237 void (*func
)( struct brw_vs_compile
*,
241 if (dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) {
242 struct brw_compile
*p
= &c
->func
;
243 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
245 brw_MOV(p
, dst
, tmp
);
255 * Checks if a 2-operand instruction needs an intermediate temporary.
257 static void unalias2( struct brw_vs_compile
*c
,
261 void (*func
)( struct brw_vs_compile
*,
266 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
267 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
)) {
268 struct brw_compile
*p
= &c
->func
;
269 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
270 func(c
, tmp
, arg0
, arg1
);
271 brw_MOV(p
, dst
, tmp
);
275 func(c
, dst
, arg0
, arg1
);
281 * Checks if a 3-operand instruction needs an intermediate temporary.
283 static void unalias3( struct brw_vs_compile
*c
,
288 void (*func
)( struct brw_vs_compile
*,
294 if ((dst
.file
== arg0
.file
&& dst
.nr
== arg0
.nr
) ||
295 (dst
.file
== arg1
.file
&& dst
.nr
== arg1
.nr
) ||
296 (dst
.file
== arg2
.file
&& dst
.nr
== arg2
.nr
)) {
297 struct brw_compile
*p
= &c
->func
;
298 struct brw_reg tmp
= brw_writemask(get_tmp(c
), dst
.dw1
.bits
.writemask
);
299 func(c
, tmp
, arg0
, arg1
, arg2
);
300 brw_MOV(p
, dst
, tmp
);
304 func(c
, dst
, arg0
, arg1
, arg2
);
308 static void emit_sop( struct brw_compile
*p
,
314 brw_MOV(p
, dst
, brw_imm_f(0.0f
));
315 brw_CMP(p
, brw_null_reg(), cond
, arg0
, arg1
);
316 brw_MOV(p
, dst
, brw_imm_f(1.0f
));
317 brw_set_predicate_control_flag_value(p
, 0xff);
320 static void emit_seq( struct brw_compile
*p
,
323 struct brw_reg arg1
)
325 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_EQ
);
328 static void emit_sne( struct brw_compile
*p
,
331 struct brw_reg arg1
)
333 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_NEQ
);
335 static void emit_slt( struct brw_compile
*p
,
338 struct brw_reg arg1
)
340 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_L
);
343 static void emit_sle( struct brw_compile
*p
,
346 struct brw_reg arg1
)
348 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_LE
);
351 static void emit_sgt( struct brw_compile
*p
,
354 struct brw_reg arg1
)
356 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_G
);
359 static void emit_sge( struct brw_compile
*p
,
362 struct brw_reg arg1
)
364 emit_sop(p
, dst
, arg0
, arg1
, BRW_CONDITIONAL_GE
);
367 static void emit_max( struct brw_compile
*p
,
370 struct brw_reg arg1
)
372 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
373 brw_SEL(p
, dst
, arg1
, arg0
);
374 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
377 static void emit_min( struct brw_compile
*p
,
380 struct brw_reg arg1
)
382 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_L
, arg0
, arg1
);
383 brw_SEL(p
, dst
, arg0
, arg1
);
384 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
388 static void emit_math1( struct brw_vs_compile
*c
,
394 /* There are various odd behaviours with SEND on the simulator. In
395 * addition there are documented issues with the fact that the GEN4
396 * processor doesn't do dependency control properly on SEND
397 * results. So, on balance, this kludge to get around failures
398 * with writemasked math results looks like it might be necessary
399 * whether that turns out to be a simulator bug or not:
401 struct brw_compile
*p
= &c
->func
;
402 struct brw_reg tmp
= dst
;
403 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
404 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
412 BRW_MATH_SATURATE_NONE
,
415 BRW_MATH_DATA_SCALAR
,
419 brw_MOV(p
, dst
, tmp
);
425 static void emit_math2( struct brw_vs_compile
*c
,
432 struct brw_compile
*p
= &c
->func
;
433 struct brw_reg tmp
= dst
;
434 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
435 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
440 brw_MOV(p
, brw_message_reg(3), arg1
);
445 BRW_MATH_SATURATE_NONE
,
448 BRW_MATH_DATA_SCALAR
,
452 brw_MOV(p
, dst
, tmp
);
458 static void emit_exp_noalias( struct brw_vs_compile
*c
,
460 struct brw_reg arg0
)
462 struct brw_compile
*p
= &c
->func
;
465 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
) {
466 struct brw_reg tmp
= get_tmp(c
);
467 struct brw_reg tmp_d
= retype(tmp
, BRW_REGISTER_TYPE_D
);
469 /* tmp_d = floor(arg0.x) */
470 brw_RNDD(p
, tmp_d
, brw_swizzle1(arg0
, 0));
472 /* result[0] = 2.0 ^ tmp */
474 /* Adjust exponent for floating point:
477 brw_ADD(p
, brw_writemask(tmp_d
, WRITEMASK_X
), tmp_d
, brw_imm_d(127));
479 /* Install exponent and sign.
480 * Excess drops off the edge:
482 brw_SHL(p
, brw_writemask(retype(dst
, BRW_REGISTER_TYPE_D
), WRITEMASK_X
),
483 tmp_d
, brw_imm_d(23));
488 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
) {
489 /* result[1] = arg0.x - floor(arg0.x) */
490 brw_FRC(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
, 0));
493 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
494 /* As with the LOG instruction, we might be better off just
495 * doing a taylor expansion here, seeing as we have to do all
498 * If mathbox partial precision is too low, consider also:
499 * result[2] = result[0] * EXP(result[1])
502 BRW_MATH_FUNCTION_EXP
,
503 brw_writemask(dst
, WRITEMASK_Z
),
504 brw_swizzle1(arg0
, 0),
505 BRW_MATH_PRECISION_FULL
);
508 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
509 /* result[3] = 1.0; */
510 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), brw_imm_f(1));
515 static void emit_log_noalias( struct brw_vs_compile
*c
,
517 struct brw_reg arg0
)
519 struct brw_compile
*p
= &c
->func
;
520 struct brw_reg tmp
= dst
;
521 struct brw_reg tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
522 struct brw_reg arg0_ud
= retype(arg0
, BRW_REGISTER_TYPE_UD
);
523 GLboolean need_tmp
= (dst
.dw1
.bits
.writemask
!= 0xf ||
524 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
528 tmp_ud
= retype(tmp
, BRW_REGISTER_TYPE_UD
);
531 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
534 * These almost look like they could be joined up, but not really
537 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
538 * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
540 if (dst
.dw1
.bits
.writemask
& WRITEMASK_XZ
) {
542 brw_writemask(tmp_ud
, WRITEMASK_X
),
543 brw_swizzle1(arg0_ud
, 0),
544 brw_imm_ud((1U<<31)-1));
547 brw_writemask(tmp_ud
, WRITEMASK_X
),
552 brw_writemask(tmp
, WRITEMASK_X
),
553 retype(tmp_ud
, BRW_REGISTER_TYPE_D
), /* does it matter? */
557 if (dst
.dw1
.bits
.writemask
& WRITEMASK_YZ
) {
559 brw_writemask(tmp_ud
, WRITEMASK_Y
),
560 brw_swizzle1(arg0_ud
, 0),
561 brw_imm_ud((1<<23)-1));
564 brw_writemask(tmp_ud
, WRITEMASK_Y
),
566 brw_imm_ud(127<<23));
569 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
) {
570 /* result[2] = result[0] + LOG2(result[1]); */
572 /* Why bother? The above is just a hint how to do this with a
573 * taylor series. Maybe we *should* use a taylor series as by
574 * the time all the above has been done it's almost certainly
575 * quicker than calling the mathbox, even with low precision.
578 * - result[0] + mathbox.LOG2(result[1])
579 * - mathbox.LOG2(arg0.x)
580 * - result[0] + inline_taylor_approx(result[1])
583 BRW_MATH_FUNCTION_LOG
,
584 brw_writemask(tmp
, WRITEMASK_Z
),
585 brw_swizzle1(tmp
, 1),
586 BRW_MATH_PRECISION_FULL
);
589 brw_writemask(tmp
, WRITEMASK_Z
),
590 brw_swizzle1(tmp
, 2),
591 brw_swizzle1(tmp
, 0));
594 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
) {
595 /* result[3] = 1.0; */
596 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_W
), brw_imm_f(1));
600 brw_MOV(p
, dst
, tmp
);
606 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
608 static void emit_dst_noalias( struct brw_vs_compile
*c
,
613 struct brw_compile
*p
= &c
->func
;
615 /* There must be a better way to do this:
617 if (dst
.dw1
.bits
.writemask
& WRITEMASK_X
)
618 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_X
), brw_imm_f(1.0));
619 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Y
)
620 brw_MUL(p
, brw_writemask(dst
, WRITEMASK_Y
), arg0
, arg1
);
621 if (dst
.dw1
.bits
.writemask
& WRITEMASK_Z
)
622 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Z
), arg0
);
623 if (dst
.dw1
.bits
.writemask
& WRITEMASK_W
)
624 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_W
), arg1
);
628 static void emit_xpd( struct brw_compile
*p
,
633 brw_MUL(p
, brw_null_reg(), brw_swizzle(t
, 1,2,0,3), brw_swizzle(u
,2,0,1,3));
634 brw_MAC(p
, dst
, negate(brw_swizzle(t
, 2,0,1,3)), brw_swizzle(u
,1,2,0,3));
638 static void emit_lit_noalias( struct brw_vs_compile
*c
,
640 struct brw_reg arg0
)
642 struct brw_compile
*p
= &c
->func
;
643 struct brw_instruction
*if_insn
;
644 struct brw_reg tmp
= dst
;
645 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
650 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_YZ
), brw_imm_f(0));
651 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_XW
), brw_imm_f(1));
653 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
654 * to get all channels active inside the IF. In the clipping code
655 * we run with NoMask, so it's not an option and we can use
656 * BRW_EXECUTE_1 for all comparisons.
658 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,0), brw_imm_f(0));
659 if_insn
= brw_IF(p
, BRW_EXECUTE_8
);
661 brw_MOV(p
, brw_writemask(dst
, WRITEMASK_Y
), brw_swizzle1(arg0
,0));
663 brw_CMP(p
, brw_null_reg(), BRW_CONDITIONAL_G
, brw_swizzle1(arg0
,1), brw_imm_f(0));
664 brw_MOV(p
, brw_writemask(tmp
, WRITEMASK_Z
), brw_swizzle1(arg0
,1));
665 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
668 BRW_MATH_FUNCTION_POW
,
669 brw_writemask(dst
, WRITEMASK_Z
),
670 brw_swizzle1(tmp
, 2),
671 brw_swizzle1(arg0
, 3),
672 BRW_MATH_PRECISION_PARTIAL
);
675 brw_ENDIF(p
, if_insn
);
680 static void emit_lrp_noalias(struct brw_vs_compile
*c
,
686 struct brw_compile
*p
= &c
->func
;
688 brw_ADD(p
, dst
, negate(arg0
), brw_imm_f(1.0));
689 brw_MUL(p
, brw_null_reg(), dst
, arg2
);
690 brw_MAC(p
, dst
, arg0
, arg1
);
693 /** 3 or 4-component vector normalization */
694 static void emit_nrm( struct brw_vs_compile
*c
,
699 struct brw_compile
*p
= &c
->func
;
700 struct brw_reg tmp
= get_tmp(c
);
702 /* tmp = dot(arg0, arg0) */
704 brw_DP3(p
, tmp
, arg0
, arg0
);
706 brw_DP4(p
, tmp
, arg0
, arg0
);
708 /* tmp = 1 / sqrt(tmp) */
709 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, tmp
, tmp
, BRW_MATH_PRECISION_FULL
);
711 /* dst = arg0 * tmp */
712 brw_MUL(p
, dst
, arg0
, tmp
);
718 static struct brw_reg
719 get_constant(struct brw_vs_compile
*c
,
720 const struct prog_instruction
*inst
,
723 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
724 struct brw_compile
*p
= &c
->func
;
725 struct brw_reg const_reg
;
726 struct brw_reg const2_reg
;
727 const GLboolean relAddr
= src
->RelAddr
;
729 assert(argIndex
< 3);
731 if (c
->current_const
[argIndex
].index
!= src
->Index
|| relAddr
) {
732 struct brw_reg addrReg
= c
->regs
[PROGRAM_ADDRESS
][0];
734 c
->current_const
[argIndex
].index
= src
->Index
;
737 printf(" fetch const[%d] for arg %d into reg %d\n",
738 src
->Index
, argIndex
, c
->current_const
[argIndex
].reg
.nr
);
740 /* need to fetch the constant now */
742 c
->current_const
[argIndex
].reg
,/* writeback dest */
744 relAddr
, /* relative indexing? */
745 addrReg
, /* address register */
746 16 * src
->Index
, /* byte offset */
747 SURF_INDEX_VERT_CONST_BUFFER
/* binding table index */
752 const2_reg
= get_tmp(c
);
754 /* use upper half of address reg for second read */
755 addrReg
= stride(addrReg
, 0, 4, 0);
759 const2_reg
, /* writeback dest */
761 relAddr
, /* relative indexing? */
762 addrReg
, /* address register */
763 16 * src
->Index
, /* byte offset */
764 SURF_INDEX_VERT_CONST_BUFFER
769 const_reg
= c
->current_const
[argIndex
].reg
;
772 /* merge the two Owords into the constant register */
773 /* const_reg[7..4] = const2_reg[7..4] */
775 suboffset(stride(const_reg
, 0, 4, 1), 4),
776 suboffset(stride(const2_reg
, 0, 4, 1), 4));
777 release_tmp(c
, const2_reg
);
780 /* replicate lower four floats into upper half (to get XYZWXYZW) */
781 const_reg
= stride(const_reg
, 0, 4, 0);
790 /* TODO: relative addressing!
792 static struct brw_reg
get_reg( struct brw_vs_compile
*c
,
793 gl_register_file file
,
797 case PROGRAM_TEMPORARY
:
800 assert(c
->regs
[file
][index
].nr
!= 0);
801 return c
->regs
[file
][index
];
802 case PROGRAM_STATE_VAR
:
803 case PROGRAM_CONSTANT
:
804 case PROGRAM_UNIFORM
:
805 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
806 return c
->regs
[PROGRAM_STATE_VAR
][index
];
807 case PROGRAM_ADDRESS
:
809 return c
->regs
[file
][index
];
811 case PROGRAM_UNDEFINED
: /* undef values */
812 return brw_null_reg();
814 case PROGRAM_LOCAL_PARAM
:
815 case PROGRAM_ENV_PARAM
:
816 case PROGRAM_WRITE_ONLY
:
819 return brw_null_reg();
825 * Indirect addressing: get reg[[arg] + offset].
827 static struct brw_reg
deref( struct brw_vs_compile
*c
,
831 struct brw_compile
*p
= &c
->func
;
832 struct brw_reg tmp
= vec4(get_tmp(c
));
833 struct brw_reg addr_reg
= c
->regs
[PROGRAM_ADDRESS
][0];
834 struct brw_reg vp_address
= retype(vec1(addr_reg
), BRW_REGISTER_TYPE_UW
);
835 GLuint byte_offset
= arg
.nr
* 32 + arg
.subnr
+ offset
* 16;
836 struct brw_reg indirect
= brw_vec4_indirect(0,0);
839 brw_push_insn_state(p
);
840 brw_set_access_mode(p
, BRW_ALIGN_1
);
842 /* This is pretty clunky - load the address register twice and
843 * fetch each 4-dword value in turn. There must be a way to do
844 * this in a single pass, but I couldn't get it to work.
846 brw_ADD(p
, brw_address_reg(0), vp_address
, brw_imm_d(byte_offset
));
847 brw_MOV(p
, tmp
, indirect
);
849 brw_ADD(p
, brw_address_reg(0), suboffset(vp_address
, 8), brw_imm_d(byte_offset
));
850 brw_MOV(p
, suboffset(tmp
, 4), indirect
);
852 brw_pop_insn_state(p
);
855 /* NOTE: tmp not released */
861 * Get brw reg corresponding to the instruction's [argIndex] src reg.
862 * TODO: relative addressing!
864 static struct brw_reg
865 get_src_reg( struct brw_vs_compile
*c
,
866 const struct prog_instruction
*inst
,
869 const GLuint file
= inst
->SrcReg
[argIndex
].File
;
870 const GLint index
= inst
->SrcReg
[argIndex
].Index
;
871 const GLboolean relAddr
= inst
->SrcReg
[argIndex
].RelAddr
;
874 case PROGRAM_TEMPORARY
:
878 return deref(c
, c
->regs
[file
][0], index
);
881 assert(c
->regs
[file
][index
].nr
!= 0);
882 return c
->regs
[file
][index
];
885 case PROGRAM_STATE_VAR
:
886 case PROGRAM_CONSTANT
:
887 case PROGRAM_UNIFORM
:
888 if (c
->vp
->use_const_buffer
) {
889 return get_constant(c
, inst
, argIndex
);
892 return deref(c
, c
->regs
[PROGRAM_STATE_VAR
][0], index
);
895 assert(c
->regs
[PROGRAM_STATE_VAR
][index
].nr
!= 0);
896 return c
->regs
[PROGRAM_STATE_VAR
][index
];
898 case PROGRAM_ADDRESS
:
900 return c
->regs
[file
][index
];
902 case PROGRAM_UNDEFINED
:
903 /* this is a normal case since we loop over all three src args */
904 return brw_null_reg();
906 case PROGRAM_LOCAL_PARAM
:
907 case PROGRAM_ENV_PARAM
:
908 case PROGRAM_WRITE_ONLY
:
911 return brw_null_reg();
916 static void emit_arl( struct brw_vs_compile
*c
,
918 struct brw_reg arg0
)
920 struct brw_compile
*p
= &c
->func
;
921 struct brw_reg tmp
= dst
;
922 GLboolean need_tmp
= (dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
927 brw_RNDD(p
, tmp
, arg0
); /* tmp = round(arg0) */
928 brw_MUL(p
, dst
, tmp
, brw_imm_d(16)); /* dst = tmp * 16 */
936 * Return the brw reg for the given instruction's src argument.
937 * Will return mangled results for SWZ op. The emit_swz() function
938 * ignores this result and recalculates taking extended swizzles into
941 static struct brw_reg
get_arg( struct brw_vs_compile
*c
,
942 const struct prog_instruction
*inst
,
945 const struct prog_src_register
*src
= &inst
->SrcReg
[argIndex
];
948 if (src
->File
== PROGRAM_UNDEFINED
)
949 return brw_null_reg();
951 reg
= get_src_reg(c
, inst
, argIndex
);
953 /* Convert 3-bit swizzle to 2-bit.
955 reg
.dw1
.bits
.swizzle
= BRW_SWIZZLE4(GET_SWZ(src
->Swizzle
, 0),
956 GET_SWZ(src
->Swizzle
, 1),
957 GET_SWZ(src
->Swizzle
, 2),
958 GET_SWZ(src
->Swizzle
, 3));
960 /* Note this is ok for non-swizzle instructions:
962 reg
.negate
= src
->Negate
? 1 : 0;
969 * Get brw register for the given program dest register.
971 static struct brw_reg
get_dst( struct brw_vs_compile
*c
,
972 struct prog_dst_register dst
)
977 case PROGRAM_TEMPORARY
:
979 assert(c
->regs
[dst
.File
][dst
.Index
].nr
!= 0);
980 reg
= c
->regs
[dst
.File
][dst
.Index
];
982 case PROGRAM_ADDRESS
:
983 assert(dst
.Index
== 0);
984 reg
= c
->regs
[dst
.File
][dst
.Index
];
986 case PROGRAM_UNDEFINED
:
987 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
988 reg
= brw_null_reg();
992 reg
= brw_null_reg();
995 reg
.dw1
.bits
.writemask
= dst
.WriteMask
;
1001 static void emit_swz( struct brw_vs_compile
*c
,
1003 const struct prog_instruction
*inst
)
1005 const GLuint argIndex
= 0;
1006 const struct prog_src_register src
= inst
->SrcReg
[argIndex
];
1007 struct brw_compile
*p
= &c
->func
;
1008 GLuint zeros_mask
= 0;
1009 GLuint ones_mask
= 0;
1010 GLuint src_mask
= 0;
1012 GLboolean need_tmp
= (src
.Negate
&&
1013 dst
.file
!= BRW_GENERAL_REGISTER_FILE
);
1014 struct brw_reg tmp
= dst
;
1020 for (i
= 0; i
< 4; i
++) {
1021 if (dst
.dw1
.bits
.writemask
& (1<<i
)) {
1022 GLubyte s
= GET_SWZ(src
.Swizzle
, i
);
1041 /* Do src first, in case dst aliases src:
1044 struct brw_reg arg0
;
1046 arg0
= get_src_reg(c
, inst
, argIndex
);
1048 arg0
= brw_swizzle(arg0
,
1049 src_swz
[0], src_swz
[1],
1050 src_swz
[2], src_swz
[3]);
1052 brw_MOV(p
, brw_writemask(tmp
, src_mask
), arg0
);
1056 brw_MOV(p
, brw_writemask(tmp
, zeros_mask
), brw_imm_f(0));
1059 brw_MOV(p
, brw_writemask(tmp
, ones_mask
), brw_imm_f(1));
1062 brw_MOV(p
, brw_writemask(tmp
, src
.Negate
), negate(tmp
));
1065 brw_MOV(p
, dst
, tmp
);
1066 release_tmp(c
, tmp
);
1072 * Post-vertex-program processing. Send the results to the URB.
1074 static void emit_vertex_write( struct brw_vs_compile
*c
)
1076 struct brw_compile
*p
= &c
->func
;
1077 struct brw_reg m0
= brw_message_reg(0);
1078 struct brw_reg pos
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_HPOS
];
1082 if (c
->key
.copy_edgeflag
) {
1084 get_reg(c
, PROGRAM_OUTPUT
, VERT_RESULT_EDGE
),
1085 get_reg(c
, PROGRAM_INPUT
, VERT_ATTRIB_EDGEFLAG
));
1088 /* Build ndc coords */
1090 /* ndc = 1.0 / pos.w */
1091 emit_math1(c
, BRW_MATH_FUNCTION_INV
, ndc
, brw_swizzle1(pos
, 3), BRW_MATH_PRECISION_FULL
);
1092 /* ndc.xyz = pos * ndc */
1093 brw_MUL(p
, brw_writemask(ndc
, WRITEMASK_XYZ
), pos
, ndc
);
1095 /* Update the header for point size, user clipping flags, and -ve rhw
1098 if ((c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) ||
1099 c
->key
.nr_userclip
|| !BRW_IS_G4X(p
->brw
))
1101 struct brw_reg header1
= retype(get_tmp(c
), BRW_REGISTER_TYPE_UD
);
1104 brw_MOV(p
, header1
, brw_imm_ud(0));
1106 brw_set_access_mode(p
, BRW_ALIGN_16
);
1108 if (c
->prog_data
.outputs_written
& (1<<VERT_RESULT_PSIZ
)) {
1109 struct brw_reg psiz
= c
->regs
[PROGRAM_OUTPUT
][VERT_RESULT_PSIZ
];
1110 brw_MUL(p
, brw_writemask(header1
, WRITEMASK_W
), brw_swizzle1(psiz
, 0), brw_imm_f(1<<11));
1111 brw_AND(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(0x7ff<<8));
1114 for (i
= 0; i
< c
->key
.nr_userclip
; i
++) {
1115 brw_set_conditionalmod(p
, BRW_CONDITIONAL_L
);
1116 brw_DP4(p
, brw_null_reg(), pos
, c
->userplane
[i
]);
1117 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<i
));
1118 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1121 /* i965 clipping workaround:
1122 * 1) Test for -ve rhw
1124 * set ndc = (0,0,0,0)
1127 * Later, clipping will detect ucp[6] and ensure the primitive is
1128 * clipped against all fixed planes.
1130 if (!BRW_IS_G4X(p
->brw
)) {
1132 vec8(brw_null_reg()),
1134 brw_swizzle1(ndc
, 3),
1137 brw_OR(p
, brw_writemask(header1
, WRITEMASK_W
), header1
, brw_imm_ud(1<<6));
1138 brw_MOV(p
, ndc
, brw_imm_f(0));
1139 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1142 brw_set_access_mode(p
, BRW_ALIGN_1
); /* why? */
1143 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), header1
);
1144 brw_set_access_mode(p
, BRW_ALIGN_16
);
1146 release_tmp(c
, header1
);
1149 brw_MOV(p
, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD
), brw_imm_ud(0));
1152 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1153 * of zeros followed by two sets of NDC coordinates:
1155 brw_set_access_mode(p
, BRW_ALIGN_1
);
1156 brw_MOV(p
, offset(m0
, 2), ndc
);
1157 brw_MOV(p
, offset(m0
, 3), pos
);
1159 eot
= (c
->first_overflow_output
== 0);
1162 brw_null_reg(), /* dest */
1163 0, /* starting mrf reg nr */
1167 MIN2(c
->nr_outputs
+ 3, (BRW_MAX_MRF
-1)), /* msg len */
1168 0, /* response len */
1170 1, /* writes complete */
1171 0, /* urb destination offset */
1172 BRW_URB_SWIZZLE_INTERLEAVE
);
1174 if (c
->first_overflow_output
> 0) {
1175 /* Not all of the vertex outputs/results fit into the MRF.
1176 * Move the overflowed attributes from the GRF to the MRF and
1177 * issue another brw_urb_WRITE().
1179 /* XXX I'm not 100% sure about which MRF regs to use here. Starting
1183 for (i
= c
->first_overflow_output
; i
< VERT_RESULT_MAX
; i
++) {
1184 if (c
->prog_data
.outputs_written
& (1 << i
)) {
1185 /* move from GRF to MRF */
1186 brw_MOV(p
, brw_message_reg(4+mrf
), c
->regs
[PROGRAM_OUTPUT
][i
]);
1192 brw_null_reg(), /* dest */
1193 4, /* starting mrf reg nr */
1197 mrf
+1, /* msg len */
1198 0, /* response len */
1200 1, /* writes complete */
1201 BRW_MAX_MRF
-1, /* urb destination offset */
1202 BRW_URB_SWIZZLE_INTERLEAVE
);
1208 * Called after code generation to resolve subroutine calls and the
1210 * \param end_inst points to brw code for END instruction
1211 * \param last_inst points to last instruction emitted before vertex write
1214 post_vs_emit( struct brw_vs_compile
*c
,
1215 struct brw_instruction
*end_inst
,
1216 struct brw_instruction
*last_inst
)
1220 brw_resolve_cals(&c
->func
);
1222 /* patch up the END code to jump past subroutines, etc */
1223 offset
= last_inst
- end_inst
;
1224 brw_set_src1(end_inst
, brw_imm_d(offset
* 16));
1228 /* Emit the vertex program instructions here.
1230 void brw_vs_emit(struct brw_vs_compile
*c
)
1232 #define MAX_IF_DEPTH 32
1233 #define MAX_LOOP_DEPTH 32
1234 struct brw_compile
*p
= &c
->func
;
1235 const GLuint nr_insns
= c
->vp
->program
.Base
.NumInstructions
;
1236 GLuint insn
, if_depth
= 0, loop_depth
= 0;
1237 GLuint end_offset
= 0;
1238 struct brw_instruction
*end_inst
, *last_inst
;
1239 struct brw_instruction
*if_inst
[MAX_IF_DEPTH
], *loop_inst
[MAX_LOOP_DEPTH
];
1240 const struct brw_indirect stack_index
= brw_indirect(0, 0);
1244 if (INTEL_DEBUG
& DEBUG_VS
) {
1245 _mesa_printf("vs-emit:\n");
1246 _mesa_print_program(&c
->vp
->program
.Base
);
1250 brw_set_compression_control(p
, BRW_COMPRESSION_NONE
);
1251 brw_set_access_mode(p
, BRW_ALIGN_16
);
1253 /* Message registers can't be read, so copy the output into GRF register
1254 if they are used in source registers */
1255 for (insn
= 0; insn
< nr_insns
; insn
++) {
1257 struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1258 for (i
= 0; i
< 3; i
++) {
1259 struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1260 GLuint index
= src
->Index
;
1261 GLuint file
= src
->File
;
1262 if (file
== PROGRAM_OUTPUT
&& index
!= VERT_RESULT_HPOS
)
1263 c
->output_regs
[index
].used_in_src
= GL_TRUE
;
1267 /* Static register allocation
1269 brw_vs_alloc_regs(c
);
1270 brw_MOV(p
, get_addr_reg(stack_index
), brw_address(c
->stack
));
1272 for (insn
= 0; insn
< nr_insns
; insn
++) {
1274 const struct prog_instruction
*inst
= &c
->vp
->program
.Base
.Instructions
[insn
];
1275 struct brw_reg args
[3], dst
;
1279 printf("%d: ", insn
);
1280 _mesa_print_instruction(inst
);
1283 /* Get argument regs. SWZ is special and does this itself.
1285 if (inst
->Opcode
!= OPCODE_SWZ
)
1286 for (i
= 0; i
< 3; i
++) {
1287 const struct prog_src_register
*src
= &inst
->SrcReg
[i
];
1290 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1291 args
[i
] = c
->output_regs
[index
].reg
;
1293 args
[i
] = get_arg(c
, inst
, i
);
1296 /* Get dest regs. Note that it is possible for a reg to be both
1297 * dst and arg, given the static allocation of registers. So
1298 * care needs to be taken emitting multi-operation instructions.
1300 index
= inst
->DstReg
.Index
;
1301 file
= inst
->DstReg
.File
;
1302 if (file
== PROGRAM_OUTPUT
&& c
->output_regs
[index
].used_in_src
)
1303 dst
= c
->output_regs
[index
].reg
;
1305 dst
= get_dst(c
, inst
->DstReg
);
1307 if (inst
->SaturateMode
!= SATURATE_OFF
) {
1308 _mesa_problem(NULL
, "Unsupported saturate %d in vertex shader",
1309 inst
->SaturateMode
);
1312 switch (inst
->Opcode
) {
1314 brw_MOV(p
, dst
, brw_abs(args
[0]));
1317 brw_ADD(p
, dst
, args
[0], args
[1]);
1320 emit_math1(c
, BRW_MATH_FUNCTION_COS
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1323 brw_DP3(p
, dst
, args
[0], args
[1]);
1326 brw_DP4(p
, dst
, args
[0], args
[1]);
1329 brw_DPH(p
, dst
, args
[0], args
[1]);
1332 emit_nrm(c
, dst
, args
[0], 3);
1335 emit_nrm(c
, dst
, args
[0], 4);
1338 unalias2(c
, dst
, args
[0], args
[1], emit_dst_noalias
);
1341 unalias1(c
, dst
, args
[0], emit_exp_noalias
);
1344 emit_math1(c
, BRW_MATH_FUNCTION_EXP
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1347 emit_arl(c
, dst
, args
[0]);
1350 brw_RNDD(p
, dst
, args
[0]);
1353 brw_FRC(p
, dst
, args
[0]);
1356 unalias1(c
, dst
, args
[0], emit_log_noalias
);
1359 emit_math1(c
, BRW_MATH_FUNCTION_LOG
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1362 unalias1(c
, dst
, args
[0], emit_lit_noalias
);
1365 unalias3(c
, dst
, args
[0], args
[1], args
[2], emit_lrp_noalias
);
1368 brw_MOV(p
, brw_acc_reg(), args
[2]);
1369 brw_MAC(p
, dst
, args
[0], args
[1]);
1372 emit_max(p
, dst
, args
[0], args
[1]);
1375 emit_min(p
, dst
, args
[0], args
[1]);
1378 brw_MOV(p
, dst
, args
[0]);
1381 brw_MUL(p
, dst
, args
[0], args
[1]);
1384 emit_math2(c
, BRW_MATH_FUNCTION_POW
, dst
, args
[0], args
[1], BRW_MATH_PRECISION_FULL
);
1387 emit_math1(c
, BRW_MATH_FUNCTION_INV
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1390 emit_math1(c
, BRW_MATH_FUNCTION_RSQ
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1394 emit_seq(p
, dst
, args
[0], args
[1]);
1397 emit_math1(c
, BRW_MATH_FUNCTION_SIN
, dst
, args
[0], BRW_MATH_PRECISION_FULL
);
1400 emit_sne(p
, dst
, args
[0], args
[1]);
1403 emit_sge(p
, dst
, args
[0], args
[1]);
1406 emit_sgt(p
, dst
, args
[0], args
[1]);
1409 emit_slt(p
, dst
, args
[0], args
[1]);
1412 emit_sle(p
, dst
, args
[0], args
[1]);
1415 brw_ADD(p
, dst
, args
[0], negate(args
[1]));
1418 /* The args[0] value can't be used here as it won't have
1419 * correctly encoded the full swizzle:
1421 emit_swz(c
, dst
, inst
);
1424 /* round toward zero */
1425 brw_RNDZ(p
, dst
, args
[0]);
1428 emit_xpd(p
, dst
, args
[0], args
[1]);
1431 assert(if_depth
< MAX_IF_DEPTH
);
1432 if_inst
[if_depth
++] = brw_IF(p
, BRW_EXECUTE_8
);
1435 if_inst
[if_depth
-1] = brw_ELSE(p
, if_inst
[if_depth
-1]);
1438 assert(if_depth
> 0);
1439 brw_ENDIF(p
, if_inst
[--if_depth
]);
1442 case OPCODE_BGNLOOP
:
1443 loop_inst
[loop_depth
++] = brw_DO(p
, BRW_EXECUTE_8
);
1447 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1451 brw_set_predicate_control(p
, BRW_PREDICATE_NONE
);
1453 case OPCODE_ENDLOOP
:
1455 struct brw_instruction
*inst0
, *inst1
;
1457 inst0
= inst1
= brw_WHILE(p
, loop_inst
[loop_depth
]);
1458 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
1459 while (inst0
> loop_inst
[loop_depth
]) {
1461 if (inst0
->header
.opcode
== BRW_OPCODE_BREAK
) {
1462 inst0
->bits3
.if_else
.jump_count
= inst1
- inst0
+ 1;
1463 inst0
->bits3
.if_else
.pop_count
= 0;
1465 else if (inst0
->header
.opcode
== BRW_OPCODE_CONTINUE
) {
1466 inst0
->bits3
.if_else
.jump_count
= inst1
- inst0
;
1467 inst0
->bits3
.if_else
.pop_count
= 0;
1477 brw_set_predicate_control(p
, BRW_PREDICATE_NORMAL
);
1478 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1479 brw_set_predicate_control_flag_value(p
, 0xff);
1482 brw_set_access_mode(p
, BRW_ALIGN_1
);
1483 brw_ADD(p
, deref_1d(stack_index
, 0), brw_ip_reg(), brw_imm_d(3*16));
1484 brw_set_access_mode(p
, BRW_ALIGN_16
);
1485 brw_ADD(p
, get_addr_reg(stack_index
),
1486 get_addr_reg(stack_index
), brw_imm_d(4));
1487 brw_save_call(p
, inst
->Comment
, p
->nr_insn
);
1488 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1491 brw_ADD(p
, get_addr_reg(stack_index
),
1492 get_addr_reg(stack_index
), brw_imm_d(-4));
1493 brw_set_access_mode(p
, BRW_ALIGN_1
);
1494 brw_MOV(p
, brw_ip_reg(), deref_1d(stack_index
, 0));
1495 brw_set_access_mode(p
, BRW_ALIGN_16
);
1498 end_offset
= p
->nr_insn
;
1499 /* this instruction will get patched later to jump past subroutine
1502 brw_ADD(p
, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1508 brw_save_label(p
, inst
->Comment
, p
->nr_insn
);
1514 _mesa_problem(NULL
, "Unsupported opcode %i (%s) in vertex shader",
1515 inst
->Opcode
, inst
->Opcode
< MAX_OPCODE
?
1516 _mesa_opcode_string(inst
->Opcode
) :
1520 if ((inst
->DstReg
.File
== PROGRAM_OUTPUT
)
1521 && (inst
->DstReg
.Index
!= VERT_RESULT_HPOS
)
1522 && c
->output_regs
[inst
->DstReg
.Index
].used_in_src
) {
1523 brw_MOV(p
, get_dst(c
, inst
->DstReg
), dst
);
1526 /* Result color clamping.
1528 * When destination register is an output register and
1529 * it's primary/secondary front/back color, we have to clamp
1530 * the result to [0,1]. This is done by enabling the
1531 * saturation bit for the last instruction.
1533 * We don't use brw_set_saturate() as it modifies
1534 * p->current->header.saturate, which affects all the subsequent
1535 * instructions. Instead, we directly modify the header
1536 * of the last (already stored) instruction.
1538 if (inst
->DstReg
.File
== PROGRAM_OUTPUT
) {
1539 if ((inst
->DstReg
.Index
== VERT_RESULT_COL0
)
1540 || (inst
->DstReg
.Index
== VERT_RESULT_COL1
)
1541 || (inst
->DstReg
.Index
== VERT_RESULT_BFC0
)
1542 || (inst
->DstReg
.Index
== VERT_RESULT_BFC1
)) {
1543 p
->store
[p
->nr_insn
-1].header
.saturate
= 1;
1550 end_inst
= &p
->store
[end_offset
];
1551 last_inst
= &p
->store
[p
->nr_insn
];
1553 /* The END instruction will be patched to jump to this code */
1554 emit_vertex_write(c
);
1556 post_vs_emit(c
, end_inst
, last_inst
);