Merge branch 'glapi-reorg'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Allocation order (which fixes register numbers) is: r0 header, user
 * clip planes, constants, inputs, outputs, temporaries, address regs,
 * pull-constant staging regs, output-copy regs, call stack, then
 * scratch temporaries.  Also derives urb_read_length / urb_entry_size.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;
   int first_reladdr_output;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
         /* Two vec4 planes per GRF (one in each register half). */
         for (i = 0; i < c->key.nr_userclip; i++) {
            c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
                                                  (i % 2) * 4), 0, 4, 1);
         }
         reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
         /* Pre-gen6 the planes sit 6 vec4 slots into the curbe. */
         for (i = 0; i < c->key.nr_userclip; i++) {
            c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
                                                  (i % 2) * 4), 0, 4, 1);
         }
         reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }

   }

   /* Vertex program parameters from curbe:
    */
   if (c->vp->use_const_buffer) {
      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
      int constant = 0;

      /* We've got more constants than we can load with the push
       * mechanism.  This is often correlated with reladdr loads where
       * we should probably be using a pull mechanism anyway to avoid
       * excessive reading.  However, the pull mechanism is slow in
       * general.  So, we try to allocate as many non-reladdr-loaded
       * constants through the push buffer as we can before giving up.
       */
      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
      for (i = 0;
	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
	   i++) {
	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
	 int arg;

	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
            /* Only direct references to parameter-style files get a
             * push-constant slot; reladdr references stay pulled.
             */
	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
		inst->SrcReg[arg].RelAddr)
	       continue;

	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
	    }
	 }
      }

      /* Two push constants per GRF, one vec4 per register half. */
      for (i = 0; i < constant; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
                                                              (i%2) * 4),
                                                 0, 4, 1);
      }
      reg += (constant + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;
      /* XXX 0 causes a bug elsewhere... */
      c->prog_data.nr_params = MAX2(constant * 4, 4);
   }
   else {
      /* use a section of the GRF for constants */
      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
      for (i = 0; i < nr_params; i++) {
         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
      }
      reg += (nr_params + 1) / 2;
      c->prog_data.curb_read_length = reg - 1;

      c->prog_data.nr_params = nr_params * 4;
   }

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for compute-to-MRF outputs, per generation. */
   if (intel->gen >= 6) {
      mrf = 3;
      if (c->key.nr_userclip)
	 mrf += 2;
   } else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
            if (first_reladdr_output > i && mrf < 15) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               /* Overflow (or reladdr-written) outputs land in GRFs. */
               if (mrf >= 15 && !c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
	       mrf++;
            }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      /* Staging registers for pull-constant loads (one per src arg). */
      for (i = 0; i < 3; i++) {
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
      clear_current_const(c);
   }

   /* Copies of outputs that are also read as sources. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6) {
      int header_regs = 2;
      if (c->key.nr_userclip)
	 header_regs += 2;

      /* Gen6 URB entries are sized in units of 8 regs (1024 bits). */
      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
   } else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (INTEL_DEBUG & DEBUG_VS) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
403
404
405 /**
406 * If an instruction uses a temp reg both as a src and the dest, we
407 * sometimes need to allocate an intermediate temporary.
408 */
409 static void unalias1( struct brw_vs_compile *c,
410 struct brw_reg dst,
411 struct brw_reg arg0,
412 void (*func)( struct brw_vs_compile *,
413 struct brw_reg,
414 struct brw_reg ))
415 {
416 if (dst.file == arg0.file && dst.nr == arg0.nr) {
417 struct brw_compile *p = &c->func;
418 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
419 func(c, tmp, arg0);
420 brw_MOV(p, dst, tmp);
421 release_tmp(c, tmp);
422 }
423 else {
424 func(c, dst, arg0);
425 }
426 }
427
428 /**
429 * \sa unalias2
430 * Checkes if 2-operand instruction needs an intermediate temporary.
431 */
432 static void unalias2( struct brw_vs_compile *c,
433 struct brw_reg dst,
434 struct brw_reg arg0,
435 struct brw_reg arg1,
436 void (*func)( struct brw_vs_compile *,
437 struct brw_reg,
438 struct brw_reg,
439 struct brw_reg ))
440 {
441 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
442 (dst.file == arg1.file && dst.nr == arg1.nr)) {
443 struct brw_compile *p = &c->func;
444 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
445 func(c, tmp, arg0, arg1);
446 brw_MOV(p, dst, tmp);
447 release_tmp(c, tmp);
448 }
449 else {
450 func(c, dst, arg0, arg1);
451 }
452 }
453
454 /**
455 * \sa unalias2
456 * Checkes if 3-operand instruction needs an intermediate temporary.
457 */
458 static void unalias3( struct brw_vs_compile *c,
459 struct brw_reg dst,
460 struct brw_reg arg0,
461 struct brw_reg arg1,
462 struct brw_reg arg2,
463 void (*func)( struct brw_vs_compile *,
464 struct brw_reg,
465 struct brw_reg,
466 struct brw_reg,
467 struct brw_reg ))
468 {
469 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
470 (dst.file == arg1.file && dst.nr == arg1.nr) ||
471 (dst.file == arg2.file && dst.nr == arg2.nr)) {
472 struct brw_compile *p = &c->func;
473 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
474 func(c, tmp, arg0, arg1, arg2);
475 brw_MOV(p, dst, tmp);
476 release_tmp(c, tmp);
477 }
478 else {
479 func(c, dst, arg0, arg1, arg2);
480 }
481 }
482
/* Set-on-condition:  dst = (arg0 <cond> arg1) ? 1.0 : 0.0, per channel.
 *
 * NOTE(review): relies on brw_CMP leaving predication enabled so the
 * second MOV only lands in channels that passed the comparison — confirm
 * against the brw_eu emit helpers.  The flag value is forced back to
 * all-ones afterwards.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
496
497 static void emit_seq( struct brw_vs_compile *c,
498 struct brw_reg dst,
499 struct brw_reg arg0,
500 struct brw_reg arg1 )
501 {
502 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
503 }
504
505 static void emit_sne( struct brw_vs_compile *c,
506 struct brw_reg dst,
507 struct brw_reg arg0,
508 struct brw_reg arg1 )
509 {
510 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
511 }
512 static void emit_slt( struct brw_vs_compile *c,
513 struct brw_reg dst,
514 struct brw_reg arg0,
515 struct brw_reg arg1 )
516 {
517 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
518 }
519
520 static void emit_sle( struct brw_vs_compile *c,
521 struct brw_reg dst,
522 struct brw_reg arg0,
523 struct brw_reg arg1 )
524 {
525 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
526 }
527
528 static void emit_sgt( struct brw_vs_compile *c,
529 struct brw_reg dst,
530 struct brw_reg arg0,
531 struct brw_reg arg1 )
532 {
533 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
534 }
535
536 static void emit_sge( struct brw_vs_compile *c,
537 struct brw_reg dst,
538 struct brw_reg arg0,
539 struct brw_reg arg1 )
540 {
541 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
542 }
543
/* CMP:  dst = (arg0 < 0) ? arg1 : arg2, per channel.
 *
 * The CMP sets the flag register; the SEL (presumably predicated on it —
 * NOTE(review): confirm against brw_eu helpers) picks arg1 where the
 * condition held and arg2 elsewhere.  Predication is reset afterwards.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
554
/* SSG:  dst = sign(arg0), i.e. -1.0, 0.0 or +1.0 per channel. */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   /* Start all channels at 0.0... */
   brw_MOV(p, dst, brw_imm_f(0));

   /* ...overwrite channels where arg0 < 0 with -1.0... */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   /* ...and channels where arg0 > 0 with +1.0. */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
571
/* MAX:  dst = max(arg0, arg1) per channel, via CMP plus a SEL gated on
 * the comparison result; predication is reset afterwards.
 */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
581
/* MIN:  dst = min(arg0, arg1) per channel, via CMP plus a SEL gated on
 * the comparison result; predication is reset afterwards.
 */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
   brw_SEL(p, dst, arg0, arg1);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
591
592
593 static void emit_math1( struct brw_vs_compile *c,
594 GLuint function,
595 struct brw_reg dst,
596 struct brw_reg arg0,
597 GLuint precision)
598 {
599 /* There are various odd behaviours with SEND on the simulator. In
600 * addition there are documented issues with the fact that the GEN4
601 * processor doesn't do dependency control properly on SEND
602 * results. So, on balance, this kludge to get around failures
603 * with writemasked math results looks like it might be necessary
604 * whether that turns out to be a simulator bug or not:
605 */
606 struct brw_compile *p = &c->func;
607 struct intel_context *intel = &p->brw->intel;
608 struct brw_reg tmp = dst;
609 GLboolean need_tmp = GL_FALSE;
610
611 if (dst.file != BRW_GENERAL_REGISTER_FILE)
612 need_tmp = GL_TRUE;
613
614 if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
615 need_tmp = GL_TRUE;
616
617 if (need_tmp)
618 tmp = get_tmp(c);
619
620 brw_math(p,
621 tmp,
622 function,
623 BRW_MATH_SATURATE_NONE,
624 2,
625 arg0,
626 BRW_MATH_DATA_SCALAR,
627 precision);
628
629 if (need_tmp) {
630 brw_MOV(p, dst, tmp);
631 release_tmp(c, tmp);
632 }
633 }
634
635
636 static void emit_math2( struct brw_vs_compile *c,
637 GLuint function,
638 struct brw_reg dst,
639 struct brw_reg arg0,
640 struct brw_reg arg1,
641 GLuint precision)
642 {
643 struct brw_compile *p = &c->func;
644 struct intel_context *intel = &p->brw->intel;
645 struct brw_reg tmp = dst;
646 GLboolean need_tmp = GL_FALSE;
647
648 if (dst.file != BRW_GENERAL_REGISTER_FILE)
649 need_tmp = GL_TRUE;
650
651 if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
652 need_tmp = GL_TRUE;
653
654 if (need_tmp)
655 tmp = get_tmp(c);
656
657 brw_MOV(p, brw_message_reg(3), arg1);
658
659 brw_math(p,
660 tmp,
661 function,
662 BRW_MATH_SATURATE_NONE,
663 2,
664 arg0,
665 BRW_MATH_DATA_SCALAR,
666 precision);
667
668 if (need_tmp) {
669 brw_MOV(p, dst, tmp);
670 release_tmp(c, tmp);
671 }
672 }
673
674
/* EXP (the partials-style ARB opcode, not scalar EX2):
 *   result.x = 2^floor(arg0.x)   (built by bit-stuffing the FP exponent)
 *   result.y = arg0.x - floor(arg0.x)
 *   result.z = 2^arg0.x          (full-precision mathbox EXP)
 *   result.w = 1.0
 * Caller guarantees dst does not alias arg0.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
730
731
/* LOG (partials-style ARB opcode):
 *   result.x = exponent of arg0.x  (extracted from the FP bits)
 *   result.y = mantissa of arg0.x  (FP bits with exponent forced to 0)
 *   result.z = log2(arg0.x) = result.x + LOG2(result.y)
 *   result.w = 1.0
 * Caller guarantees dst does not alias arg0.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* A temp is needed when dst can't be read back as a source below. */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likey they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Mask off the sign bit, then shift the exponent field down and
       * re-bias it by -127.
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits and install a biased exponent of 0,
       * yielding a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
821
822
/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx  r1
 *
 * DST (distance vector):  dst = (1, arg0.y * arg1.y, arg0.z, arg1.w),
 * each component guarded by the destination writemask.
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
843
844
/* XPD (cross product):  dst = t x u = t.yzx*u.zxy - t.zxy*u.yzx.
 *
 * NOTE(review): the MUL targets the null register, so this relies on the
 * MUL's implicit accumulator update feeding the following MAC — confirm
 * against the EU ISA docs / brw_eu helpers.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
853
854
/* LIT (lighting coefficients):
 *   dst = (1, 0, 0, 1) by default; if arg0.x > 0 then dst.y = arg0.x and
 *   dst.z = max(arg0.y, 0) ^ arg0.w.
 * Caller guarantees dst does not alias arg0.
 *
 * NOTE(review): a temp is used only for the Z clamp so it can be re-read
 * as the POW base (presumably because a non-GRF dst such as a message
 * reg can't be used as a source) — confirm.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Default result: (1, 0, 0, 1). */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* tmp.z = max(arg0.y, 0), via a CMP-gated MOV. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      /* dst.z = tmp.z ^ arg0.w */
      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   release_tmp(c, tmp);
}
896
/* LRP:  dst = arg0 * arg1 + (1 - arg0) * arg2, built as
 *   dst = 1 - arg0
 *   MUL null, dst, arg2            (product lands in the accumulator)
 *   dst = arg0 * arg1 + acc        (MAC)
 * Caller guarantees dst does not alias any source.
 * NOTE(review): relies on the MUL's implicit accumulator update, as in
 * emit_xpd — confirm against the EU ISA docs.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
909
910 /** 3 or 4-component vector normalization */
911 static void emit_nrm( struct brw_vs_compile *c,
912 struct brw_reg dst,
913 struct brw_reg arg0,
914 int num_comps)
915 {
916 struct brw_compile *p = &c->func;
917 struct brw_reg tmp = get_tmp(c);
918
919 /* tmp = dot(arg0, arg0) */
920 if (num_comps == 3)
921 brw_DP3(p, tmp, arg0, arg0);
922 else
923 brw_DP4(p, tmp, arg0, arg0);
924
925 /* tmp = 1 / sqrt(tmp) */
926 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
927
928 /* dst = arg0 * tmp */
929 brw_MUL(p, dst, arg0, tmp);
930
931 release_tmp(c, tmp);
932 }
933
934
/* Return the pull-constant register for src arg `argIndex` of `inst`,
 * issuing a data-port read into the per-slot staging register unless
 * the slot already holds that constant (tracked in current_const).
 * The returned register is strided so the lower four floats are
 * replicated across both halves (XYZWXYZW).
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
970
/* Like get_constant(), but for a relative-addressed (a0-indexed)
 * constant reference: computes the byte address from the address
 * register and issues an indirect data-port read.  Such loads can never
 * be reused, so the cache slot is invalidated.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* byte_addr = a0 * 16 (one vec4 of floats per constant) */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,                     /* writeback dest */
			     byte_addr_reg,                 /* address register */
			     16 * src->Index,               /* byte offset */
			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
			     );

   return const_reg;
}
1006
1007
1008
1009 /* TODO: relative addressing!
1010 */
1011 static struct brw_reg get_reg( struct brw_vs_compile *c,
1012 gl_register_file file,
1013 GLuint index )
1014 {
1015 switch (file) {
1016 case PROGRAM_TEMPORARY:
1017 case PROGRAM_INPUT:
1018 case PROGRAM_OUTPUT:
1019 assert(c->regs[file][index].nr != 0);
1020 return c->regs[file][index];
1021 case PROGRAM_STATE_VAR:
1022 case PROGRAM_CONSTANT:
1023 case PROGRAM_UNIFORM:
1024 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1025 return c->regs[PROGRAM_STATE_VAR][index];
1026 case PROGRAM_ADDRESS:
1027 assert(index == 0);
1028 return c->regs[file][index];
1029
1030 case PROGRAM_UNDEFINED: /* undef values */
1031 return brw_null_reg();
1032
1033 case PROGRAM_LOCAL_PARAM:
1034 case PROGRAM_ENV_PARAM:
1035 case PROGRAM_WRITE_ONLY:
1036 default:
1037 assert(0);
1038 return brw_null_reg();
1039 }
1040 }
1041
1042
/**
 * Indirect addressing:  get reg[[arg] + offset].
 *
 * Loads both vertices' vec4s via a0.0/a0.1 register-indirect access into
 * a fresh temporary.  The caller owns the returned temporary (it is
 * deliberately not released here; see the NOTE at the end).
 * NOTE(review): the `acc` scratch temporary is also left allocated —
 * presumably reclaimed by a later release_tmps() — confirm.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset,
			     GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   /* Base byte address of `arg` in the GRF (32 bytes per register). */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = first vertex's index * reg_size + base offset */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = second vertex's index (address component 4) likewise */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1082
/* Store `val` to the instruction's relative-addressed (a0-indexed)
 * destination register.
 *
 * Because destination register indirect addressing can only use one
 * index, each vertex's vec4 half of `val` is written with a separate
 * indirect MOV (a0 recomputed in between).
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   /* Base byte address of the destination block in the GRF. */
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* First vertex: a0 = index * reg_size + base */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Second vertex: address component 4, upper half of the register. */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1117
1118 /**
1119 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1120 * TODO: relative addressing!
1121 */
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 *
 * For opcodes/arguments where brw_vs_arg_can_be_immediate() allows it,
 * common constant patterns (all-zero swizzle, all-one swizzle, or a
 * single replicated component of a PROGRAM_CONSTANT) are folded into a
 * float immediate instead of a register access.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      /* swizzle .0000 -> literal 0.0 */
      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO)) {
	 return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE)) {
	 /* swizzle .1111 -> literal +/-1.0 (negate folded in) */
	 if (src->Negate)
	    return brw_imm_f(-1.0F);
	 else
	    return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
	 const struct gl_program_parameter_list *params;
	 float f;
	 int component = -1;

	 /* Only a fully-replicated single component can become an
	  * immediate; anything else falls through to a register access.
	  */
	 switch (src->Swizzle) {
	 case SWIZZLE_XXXX:
	    component = 0;
	    break;
	 case SWIZZLE_YYYY:
	    component = 1;
	    break;
	 case SWIZZLE_ZZZZ:
	    component = 2;
	    break;
	 case SWIZZLE_WWWW:
	    component = 3;
	    break;
	 }

	 if (component >= 0) {
	    /* Read the constant's value and fold Abs/Negate into it. */
	    params = c->vp->program.Base.Parameters;
	    f = params->ParameterValues[src->Index][component];

	    if (src->Abs)
	       f = fabs(f);
	    if (src->Negate)
	       f = -f;
	    return brw_imm_f(f);
	 }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
	 /* reg[A0 + index], 32 bytes per element */
	 return deref(c, c->regs[file][0], index, 32);
      }
      else {
	 assert(c->regs[file][index].nr != 0);
	 return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (c->vp->use_const_buffer) {
	 /* Constants live in a buffer: prefer the pre-uploaded copy
	  * (constant_map), otherwise fetch via a (rel-addr) read.
	  */
	 if (!relAddr && c->constant_map[index] != -1) {
	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
	 } else if (relAddr)
	    return get_reladdr_constant(c, inst, argIndex);
	 else
	    return get_constant(c, inst, argIndex);
      }
      else if (relAddr) {
	 /* Constants are packed two per register here: 16-byte stride. */
	 return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
      }
      else {
	 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
	 return c->regs[PROGRAM_STATE_VAR][index];
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1227
1228 /**
1229 * Return the brw reg for the given instruction's src argument.
1230 * Will return mangled results for SWZ op. The emit_swz() function
1231 * ignores this result and recalculates taking extended swizzles into
1232 * account.
1233 */
1234 static struct brw_reg get_arg( struct brw_vs_compile *c,
1235 const struct prog_instruction *inst,
1236 GLuint argIndex )
1237 {
1238 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1239 struct brw_reg reg;
1240
1241 if (src->File == PROGRAM_UNDEFINED)
1242 return brw_null_reg();
1243
1244 reg = get_src_reg(c, inst, argIndex);
1245
1246 /* Convert 3-bit swizzle to 2-bit.
1247 */
1248 if (reg.file != BRW_IMMEDIATE_VALUE) {
1249 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1250 GET_SWZ(src->Swizzle, 1),
1251 GET_SWZ(src->Swizzle, 2),
1252 GET_SWZ(src->Swizzle, 3));
1253 }
1254
1255 /* Note this is ok for non-swizzle instructions:
1256 */
1257 reg.negate = src->Negate ? 1 : 0;
1258
1259 return reg;
1260 }
1261
1262
1263 /**
1264 * Get brw register for the given program dest register.
1265 */
1266 static struct brw_reg get_dst( struct brw_vs_compile *c,
1267 struct prog_dst_register dst )
1268 {
1269 struct brw_reg reg;
1270
1271 switch (dst.File) {
1272 case PROGRAM_TEMPORARY:
1273 case PROGRAM_OUTPUT:
1274 /* register-indirect addressing is only 1x1, not VxH, for
1275 * destination regs. So, for RelAddr we'll return a temporary
1276 * for the dest and do a move of the result to the RelAddr
1277 * register after the instruction emit.
1278 */
1279 if (dst.RelAddr) {
1280 reg = get_tmp(c);
1281 } else {
1282 assert(c->regs[dst.File][dst.Index].nr != 0);
1283 reg = c->regs[dst.File][dst.Index];
1284 }
1285 break;
1286 case PROGRAM_ADDRESS:
1287 assert(dst.Index == 0);
1288 reg = c->regs[dst.File][dst.Index];
1289 break;
1290 case PROGRAM_UNDEFINED:
1291 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1292 reg = brw_null_reg();
1293 break;
1294 default:
1295 assert(0);
1296 reg = brw_null_reg();
1297 }
1298
1299 assert(reg.type != BRW_IMMEDIATE_VALUE);
1300 reg.dw1.bits.writemask = dst.WriteMask;
1301
1302 return reg;
1303 }
1304
1305
/**
 * Emit an OPCODE_SWZ: an extended swizzle whose selectors may be a
 * source component, the constant 0.0, or the constant 1.0, with
 * optional per-component negation.
 *
 * Each written channel is classified into "from source", "zero" or
 * "one" groups, each group written with its own MOV, then negation is
 * applied with a final MOV masked by the negate bits.
 */
static void emit_swz( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* The negate step reads dst back, so if dst isn't a readable GRF we
    * must stage the result in a temporary.
    */
   GLboolean need_tmp = (src.Negate &&
			 dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel by its swizzle selector. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
	 GLubyte s = GET_SWZ(src.Swizzle, i);
	 switch (s) {
	 case SWIZZLE_X:
	 case SWIZZLE_Y:
	 case SWIZZLE_Z:
	 case SWIZZLE_W:
	    src_mask |= 1<<i;
	    src_swz[i] = s;
	    break;
	 case SWIZZLE_ZERO:
	    zeros_mask |= 1<<i;
	    break;
	 case SWIZZLE_ONE:
	    ones_mask |= 1<<i;
	    break;
	 }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-component bitmask, usable directly as the
    * writemask for the negating MOV.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1374
1375
1376 /**
1377 * Post-vertex-program processing. Send the results to the URB.
1378 */
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the per-generation VUE header (NDC position, point size, user
 * clip flags/distances), copies the remaining outputs into message
 * registers, and issues one or two URB write messages (two when not
 * all outputs fit in the MRF space of a single message).
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;
   int next_mrf, i;

   /* Copy the input edge flag to the output if the key asks for it. */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
	      get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
	      get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (intel->gen >= 6) {
      struct brw_reg m1 = brw_message_reg(1);

      /* On gen6, m1 has each value in a separate dword, so we never
       * need to mess with a temporary for computing the m1 value.
       */
      brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
	 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
		 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
      }

      /* Set the user clip distances in dword 8-15. (m3-4)*/
      if (c->key.nr_userclip) {
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    struct brw_reg m;
	    if (i < 4)
	       m = brw_message_reg(3);
	    else
	       m = brw_message_reg(4);

	    /* NOTE(review): the writemask field is only 4 bits wide, so
	     * for i >= 4 this shift looks like it should be
	     * (1 << (i & 3)) — confirm against the gen6 clip-distance
	     * layout before relying on planes 4-7.
	     */
	    brw_DP4(p, brw_writemask(m, (1 << (i & 7))),pos, c->userplane[i]);
	 }
      }
   } else if ((c->prog_data.outputs_written &
	       BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
	      c->key.nr_userclip || brw->has_negative_rhw_bug) {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
	 /* Scale point size into the fixed-point field at bits 8..18 of
	  * header dword W, then mask to that field.
	  */
	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
		 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
	 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
		 header1, brw_imm_ud(0x7ff<<8));
      }

      /* For each user clip plane, test the position against the plane
       * and accumulate a flag bit (1<<i) in header1.w.
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
	 brw_CMP(p,
		 vec8(brw_null_reg()),
		 BRW_CONDITIONAL_L,
		 brw_swizzle1(ndc, 3),
		 brw_imm_f(0));

	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
	 brw_MOV(p, ndc, brw_imm_f(0));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No point size, clipping, or workaround needed: zero header. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_acc_write_control(p, 0);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.
       * m3 or 5 is the first vertex element data we fill, which is
       * the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      len_vertex_header = 1;
      if (c->key.nr_userclip > 0)
	 len_vertex_header += 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Move variable-addressed, non-overflow outputs to their MRFs. */
   next_mrf = 2 + len_vertex_header;
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      /* Outputs past first_overflow_output go in the second URB write. */
      if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
	 break;
      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
	 continue;

      if (i >= VERT_RESULT_TEX0 &&
	  c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
	 /* Output lives in a GRF: copy it to the next message register. */
	 brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
	 next_mrf++;
      } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
	 /* Output was written directly to an MRF: continue after it. */
	 next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
      }
   }

   /* The first write is also the last (end-of-thread) only if nothing
    * overflowed into a second message.
    */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
		 brw_null_reg(), /* dest */
		 0,		/* starting mrf reg nr */
		 c->r0,		/* src */
		 0,		/* allocate */
		 1,		/* used */
		 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
		 0,		/* response len */
		 eot, 		/* eot */
		 eot, 		/* writes complete */
		 0, 		/* urb destination offset */
		 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
	 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	    /* move from GRF to MRF */
	    brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
	    mrf++;
	 }
      }

      brw_urb_WRITE(p,
		    brw_null_reg(), /* dest */
		    0,		/* starting mrf reg nr */
		    c->r0,	/* src */
		    0,		/* allocate */
		    1,		/* used */
		    mrf,	/* msg len */
		    0,		/* response len */
		    1,		/* eot */
		    1,		/* writes complete */
		    14 / 2,	/* urb destination offset */
		    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1594
1595 static GLboolean
1596 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1597 {
1598 struct brw_compile *p = &c->func;
1599 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1600
1601 if (p->nr_insn == 0)
1602 return GL_FALSE;
1603
1604 if (val.address_mode != BRW_ADDRESS_DIRECT)
1605 return GL_FALSE;
1606
1607 switch (prev_insn->header.opcode) {
1608 case BRW_OPCODE_MOV:
1609 case BRW_OPCODE_MAC:
1610 case BRW_OPCODE_MUL:
1611 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1612 prev_insn->header.execution_size == val.width &&
1613 prev_insn->bits1.da1.dest_reg_file == val.file &&
1614 prev_insn->bits1.da1.dest_reg_type == val.type &&
1615 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1616 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1617 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1618 prev_insn->bits1.da16.dest_writemask == 0xf)
1619 return GL_TRUE;
1620 else
1621 return GL_FALSE;
1622 default:
1623 return GL_FALSE;
1624 }
1625 }
1626
1627 static uint32_t
1628 get_predicate(const struct prog_instruction *inst)
1629 {
1630 if (inst->DstReg.CondMask == COND_TR)
1631 return BRW_PREDICATE_NONE;
1632
1633 /* All of GLSL only produces predicates for COND_NE and one channel per
1634 * vector. Fail badly if someone starts doing something else, as it might
1635 * mean infinite looping or something.
1636 *
1637 * We'd like to support all the condition codes, but our hardware doesn't
1638 * quite match the Mesa IR, which is modeled after the NV extensions. For
1639 * those, the instruction may update the condition codes or not, then any
1640 * later instruction may use one of those condition codes. For gen4, the
1641 * instruction may update the flags register based on one of the condition
1642 * codes output by the instruction, and then further instructions may
1643 * predicate on that. We can probably support this, but it won't
1644 * necessarily be easy.
1645 */
1646 assert(inst->DstReg.CondMask == COND_NE);
1647
1648 switch (inst->DstReg.CondSwizzle) {
1649 case SWIZZLE_XXXX:
1650 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1651 case SWIZZLE_YYYY:
1652 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1653 case SWIZZLE_ZZZZ:
1654 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1655 case SWIZZLE_WWWW:
1656 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1657 default:
1658 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1659 inst->DstReg.CondMask);
1660 return BRW_PREDICATE_NORMAL;
1661 }
1662 }
1663
1664 /* Emit the vertex program instructions here.
1665 */
/* Emit the vertex program instructions here.
 *
 * Main entry point of this file: translates the Mesa IR program in
 * c->vp into native i965 instructions.  Runs a pre-pass to mark
 * outputs read as sources and to detect CAL/RET (stack needed), then
 * allocates registers and emits each IR instruction, tracking IF/loop
 * nesting so flow-control jump targets can be patched afterwards.
 */
void brw_vs_emit(struct brw_vs_compile *c )
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   /* Per loop-nesting level, how many IFs are open inside it; BREAK and
    * CONT need this to compute how many scopes they pop.
    */
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   /* Dump the incoming Mesa IR when vertex-shader debug is enabled. */
   if (INTEL_DEBUG & DEBUG_VS) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
			       GL_TRUE);
      printf("\n");
   }

   /* FIXME Need to fix conditional instruction to remove this */
   if (intel->gen >= 6)
      p->single_program_flow = GL_TRUE;

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

   brw_set_acc_write_control(p, 1);

   /* Pre-pass over the IR before any code generation. */
   for (insn = 0; insn < nr_insns; insn++) {
       GLuint i;
       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

       /* Message registers can't be read, so copy the output into GRF
	* register if they are used in source registers
	*/
       for (i = 0; i < 3; i++) {
	   struct prog_src_register *src = &inst->SrcReg[i];
	   GLuint index = src->Index;
	   GLuint file = src->File;
	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
	       c->output_regs[index].used_in_src = GL_TRUE;
       }

       /* CAL/RET need the return-address stack set up below. */
       switch (inst->Opcode) {
       case OPCODE_CAL:
       case OPCODE_RET:
	  c->needs_stack = GL_TRUE;
	  break;
       default:
	  break;
       }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   /* Main code-generation loop: one iteration per IR instruction. */
   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
	  for (i = 0; i < 3; i++) {
	      const struct prog_src_register *src = &inst->SrcReg[i];
	      index = src->Index;
	      file = src->File;
	      /* Outputs also read as sources were shadowed into a GRF
	       * (see the pre-pass above); read the shadow copy.
	       */
	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
		  args[i] = c->output_regs[index].reg;
	      else
                  args[i] = get_arg(c, inst, i);
	  }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	  dst = c->output_regs[index].reg;
      else
	  dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      /* Translate the IR opcode to one or more native instructions.
       * unalias*() wrappers protect multi-instruction sequences whose
       * dst may alias a src; emit_math*() handles the math-box ops.
       */
      switch (inst->Opcode) {
      case OPCODE_ABS:
	 brw_MOV(p, dst, brw_abs(args[0]));
	 break;
      case OPCODE_ADD:
	 brw_ADD(p, dst, args[0], args[1]);
	 break;
      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_DP2:
	 brw_DP2(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP3:
	 brw_DP3(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP4:
	 brw_DP4(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DPH:
	 brw_DPH(p, dst, args[0], args[1]);
	 break;
      case OPCODE_NRM3:
	 emit_nrm(c, dst, args[0], 3);
	 break;
      case OPCODE_NRM4:
	 emit_nrm(c, dst, args[0], 4);
	 break;
      case OPCODE_DST:
	 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
	 break;
      case OPCODE_EXP:
	 unalias1(c, dst, args[0], emit_exp_noalias);
	 break;
      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_ARL:
	 /* ARL and FLR are both round-toward-minus-infinity (RNDD). */
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FLR:
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FRC:
	 brw_FRC(p, dst, args[0]);
	 break;
      case OPCODE_LOG:
	 unalias1(c, dst, args[0], emit_log_noalias);
	 break;
      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_LIT:
	 unalias1(c, dst, args[0], emit_lit_noalias);
	 break;
      case OPCODE_LRP:
	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
	 break;
      case OPCODE_MAD:
	 /* MAD = load accumulator with arg2 (skipped if it's already
	  * there), then multiply-accumulate.
	  */
	 if (!accumulator_contains(c, args[2]))
	    brw_MOV(p, brw_acc_reg(), args[2]);
	 brw_MAC(p, dst, args[0], args[1]);
	 break;
      case OPCODE_CMP:
	 emit_cmp(p, dst, args[0], args[1], args[2]);
	 break;
      case OPCODE_MAX:
	 emit_max(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MIN:
	 emit_min(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MOV:
	 brw_MOV(p, dst, args[0]);
	 break;
      case OPCODE_MUL:
	 brw_MUL(p, dst, args[0], args[1]);
	 break;
      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;

      case OPCODE_SEQ:
         unalias2(c, dst, args[0], args[1], emit_seq);
         break;
      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_SNE:
         unalias2(c, dst, args[0], args[1], emit_sne);
         break;
      case OPCODE_SGE:
	 unalias2(c, dst, args[0], args[1], emit_sge);
	 break;
      case OPCODE_SGT:
         unalias2(c, dst, args[0], args[1], emit_sgt);
         break;
      case OPCODE_SLT:
	 unalias2(c, dst, args[0], args[1], emit_slt);
	 break;
      case OPCODE_SLE:
         unalias2(c, dst, args[0], args[1], emit_sle);
         break;
      case OPCODE_SSG:
         unalias1(c, dst, args[0], emit_sign);
         break;
      case OPCODE_SUB:
	 brw_ADD(p, dst, args[0], negate(args[1]));
	 break;
      case OPCODE_SWZ:
	 /* The args[0] value can't be used here as it won't have
	  * correctly encoded the full swizzle:
	  */
	 emit_swz(c, dst, inst);
	 break;
      case OPCODE_TRUNC:
         /* round toward zero */
	 brw_RNDZ(p, dst, args[0]);
	 break;
      case OPCODE_XPD:
	 emit_xpd(p, dst, args[0], args[1]);
	 break;
      case OPCODE_IF:
	 assert(if_depth < MAX_IF_DEPTH);
	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
	 /* Note that brw_IF smashes the predicate_control field. */
	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
	 if_depth_in_loop[loop_depth]++;
	 if_depth++;
	 break;
      case OPCODE_ELSE:
	 clear_current_const(c);
	 assert(if_depth > 0);
	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
	 break;
      case OPCODE_ENDIF:
	 clear_current_const(c);
         assert(if_depth > 0);
	 brw_ENDIF(p, if_inst[--if_depth]);
	 if_depth_in_loop[loop_depth]--;
	 break;
      case OPCODE_BGNLOOP:
	 clear_current_const(c);
         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 if_depth_in_loop[loop_depth] = 0;
         break;
      case OPCODE_BRK:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_BREAK(p, if_depth_in_loop[loop_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_CONT(p, if_depth_in_loop[loop_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_ENDLOOP:
         {
	    clear_current_const(c);
            struct brw_instruction *inst0, *inst1;
            GLuint br = 1;

            loop_depth--;

            /* Ironlake's jump counts are in units of half instructions. */
            if (intel->gen == 5)
               br = 2;

            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
            while (inst0 > loop_inst[loop_depth]) {
               inst0--;
               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
               }
               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
               }
            }
         }
         break;
      case OPCODE_BRA:
	 brw_set_predicate_control(p, get_predicate(inst));
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CAL:
	 /* Push the return address (IP + 3 instructions) onto the
	  * software stack, bump the stack pointer, then jump.
	  */
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 brw_ADD(p, get_addr_reg(stack_index),
			 get_addr_reg(stack_index), brw_imm_d(4));
	 brw_save_call(p, inst->Comment, p->nr_insn);
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         break;
      case OPCODE_RET:
	 /* Pop the return address and jump back to it. */
	 brw_ADD(p, get_addr_reg(stack_index),
			 get_addr_reg(stack_index), brw_imm_d(-4));
	 brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 break;
      case OPCODE_END:
	 emit_vertex_write(c);
         break;
      case OPCODE_PRINT:
         /* no-op */
         break;
      case OPCODE_BGNSUB:
	 brw_save_label(p, inst->Comment, p->nr_insn);
         break;
      case OPCODE_ENDSUB:
         /* no-op */
         break;
      default:
	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
				    _mesa_opcode_string(inst->Opcode) :
				    "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

	 assert(hw_insn->header.destreg__conditionalmod == 0);
	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      /* If this output is also read as a source elsewhere, it was
       * computed into a shadow GRF; copy the result to the real output.
       */
      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When destination register is an output register and
       * it's primary/secondary front/back color, we have to clamp
       * the result to [0,1]. This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions. Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      /* RelAddr destinations were computed into a temporary (see
       * get_dst()); move the result to the addressed register now.
       */
      if (inst->DstReg.RelAddr) {
	 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
		inst->DstReg.File == PROGRAM_OUTPUT);
	 move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

   brw_resolve_cals(p);

   brw_optimize(p);

   /* Dump the generated native code when vertex-shader debug is on. */
   if (INTEL_DEBUG & DEBUG_VS) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}