Merge commit 'origin/gallium-0.1' into gallium-0.2
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41
42 /* Do things as simply as possible. Allocate and populate all regs
43 * ahead of time.
44 */
45 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
46 {
47 GLuint i, reg = 0, mrf;
48 GLuint nr_params;
49
50 /* r0 -- reserved as usual
51 */
52 c->r0 = brw_vec8_grf(reg, 0); reg++;
53
54 /* User clip planes from curbe:
55 */
56 if (c->key.nr_userclip) {
57 for (i = 0; i < c->key.nr_userclip; i++) {
58 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
59 }
60
61 /* Deal with curbe alignment:
62 */
63 reg += ((6+c->key.nr_userclip+3)/4)*2;
64 }
65
66 /* Vertex program parameters from curbe:
67 */
68 nr_params = c->vp->program.Base.Parameters->NumParameters;
69 for (i = 0; i < nr_params; i++) {
70 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
71 }
72 reg += (nr_params+1)/2;
73
74 c->prog_data.curb_read_length = reg - 1;
75
76
77
78 /* Allocate input regs:
79 */
80 c->nr_inputs = 0;
81 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
82 if (c->prog_data.inputs_read & (1<<i)) {
83 c->nr_inputs++;
84 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
85 reg++;
86 }
87 }
88
89
90 /* Allocate outputs: TODO: could organize the non-position outputs
91 * to go straight into message regs.
92 */
93 c->nr_outputs = 0;
94 c->first_output = reg;
95 mrf = 4;
96 for (i = 0; i < VERT_RESULT_MAX; i++) {
97 if (c->prog_data.outputs_written & (1<<i)) {
98 c->nr_outputs++;
99 if (i == VERT_RESULT_HPOS) {
100 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
101 reg++;
102 }
103 else if (i == VERT_RESULT_PSIZ) {
104 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
105 reg++;
106 mrf++; /* just a placeholder? XXX fix later stages & remove this */
107 }
108 else {
109 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
110 mrf++;
111 }
112 }
113 }
114
115 /* Allocate program temporaries:
116 */
117 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
118 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
119 reg++;
120 }
121
122 /* Address reg(s). Don't try to use the internal address reg until
123 * deref time.
124 */
125 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
126 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
127 reg,
128 0,
129 BRW_REGISTER_TYPE_D,
130 BRW_VERTICAL_STRIDE_8,
131 BRW_WIDTH_8,
132 BRW_HORIZONTAL_STRIDE_1,
133 BRW_SWIZZLE_XXXX,
134 WRITEMASK_X);
135 reg++;
136 }
137
138 for (i = 0; i < 128; i++) {
139 if (c->output_regs[i].used_in_src) {
140 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
141 reg++;
142 }
143 }
144
145 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
146 reg += 2;
147
148
149 /* Some opcodes need an internal temporary:
150 */
151 c->first_tmp = reg;
152 c->last_tmp = reg; /* for allocation purposes */
153
154 /* Each input reg holds data from two vertices. The
155 * urb_read_length is the number of registers read from *each*
156 * vertex urb, so is half the amount:
157 */
158 c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
159
160 c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
161 c->prog_data.total_grf = reg;
162 }
163
164
165 static struct brw_reg get_tmp( struct brw_vs_compile *c )
166 {
167 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
168
169 if (++c->last_tmp > c->prog_data.total_grf)
170 c->prog_data.total_grf = c->last_tmp;
171
172 return tmp;
173 }
174
175 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
176 {
177 if (tmp.nr == c->last_tmp-1)
178 c->last_tmp--;
179 }
180
181 static void release_tmps( struct brw_vs_compile *c )
182 {
183 c->last_tmp = c->first_tmp;
184 }
185
186
187 static void unalias1( struct brw_vs_compile *c,
188 struct brw_reg dst,
189 struct brw_reg arg0,
190 void (*func)( struct brw_vs_compile *,
191 struct brw_reg,
192 struct brw_reg ))
193 {
194 if (dst.file == arg0.file && dst.nr == arg0.nr) {
195 struct brw_compile *p = &c->func;
196 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
197 func(c, tmp, arg0);
198 brw_MOV(p, dst, tmp);
199 }
200 else {
201 func(c, dst, arg0);
202 }
203 }
204
205 static void unalias2( struct brw_vs_compile *c,
206 struct brw_reg dst,
207 struct brw_reg arg0,
208 struct brw_reg arg1,
209 void (*func)( struct brw_vs_compile *,
210 struct brw_reg,
211 struct brw_reg,
212 struct brw_reg ))
213 {
214 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
215 (dst.file == arg1.file && dst.nr == arg1.nr)) {
216 struct brw_compile *p = &c->func;
217 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
218 func(c, tmp, arg0, arg1);
219 brw_MOV(p, dst, tmp);
220 }
221 else {
222 func(c, dst, arg0, arg1);
223 }
224 }
225
226 static void emit_sop( struct brw_compile *p,
227 struct brw_reg dst,
228 struct brw_reg arg0,
229 struct brw_reg arg1,
230 GLuint cond)
231 {
232 brw_MOV(p, dst, brw_imm_f(0.0f));
233 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
234 brw_MOV(p, dst, brw_imm_f(1.0f));
235 brw_set_predicate_control_flag_value(p, 0xff);
236 }
237
238 static void emit_seq( struct brw_compile *p,
239 struct brw_reg dst,
240 struct brw_reg arg0,
241 struct brw_reg arg1 )
242 {
243 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
244 }
245
246 static void emit_sne( struct brw_compile *p,
247 struct brw_reg dst,
248 struct brw_reg arg0,
249 struct brw_reg arg1 )
250 {
251 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
252 }
253 static void emit_slt( struct brw_compile *p,
254 struct brw_reg dst,
255 struct brw_reg arg0,
256 struct brw_reg arg1 )
257 {
258 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
259 }
260
261 static void emit_sle( struct brw_compile *p,
262 struct brw_reg dst,
263 struct brw_reg arg0,
264 struct brw_reg arg1 )
265 {
266 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
267 }
268
269 static void emit_sgt( struct brw_compile *p,
270 struct brw_reg dst,
271 struct brw_reg arg0,
272 struct brw_reg arg1 )
273 {
274 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
275 }
276
277 static void emit_sge( struct brw_compile *p,
278 struct brw_reg dst,
279 struct brw_reg arg0,
280 struct brw_reg arg1 )
281 {
282 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
283 }
284
285 static void emit_max( struct brw_compile *p,
286 struct brw_reg dst,
287 struct brw_reg arg0,
288 struct brw_reg arg1 )
289 {
290 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
291 brw_SEL(p, dst, arg1, arg0);
292 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
293 }
294
295 static void emit_min( struct brw_compile *p,
296 struct brw_reg dst,
297 struct brw_reg arg0,
298 struct brw_reg arg1 )
299 {
300 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
301 brw_SEL(p, dst, arg0, arg1);
302 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
303 }
304
305
306 static void emit_math1( struct brw_vs_compile *c,
307 GLuint function,
308 struct brw_reg dst,
309 struct brw_reg arg0,
310 GLuint precision)
311 {
312 /* There are various odd behaviours with SEND on the simulator. In
313 * addition there are documented issues with the fact that the GEN4
314 * processor doesn't do dependency control properly on SEND
315 * results. So, on balance, this kludge to get around failures
316 * with writemasked math results looks like it might be necessary
317 * whether that turns out to be a simulator bug or not:
318 */
319 struct brw_compile *p = &c->func;
320 struct brw_reg tmp = dst;
321 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
322 dst.file != BRW_GENERAL_REGISTER_FILE);
323
324 if (need_tmp)
325 tmp = get_tmp(c);
326
327 brw_math(p,
328 tmp,
329 function,
330 BRW_MATH_SATURATE_NONE,
331 2,
332 arg0,
333 BRW_MATH_DATA_SCALAR,
334 precision);
335
336 if (need_tmp) {
337 brw_MOV(p, dst, tmp);
338 release_tmp(c, tmp);
339 }
340 }
341
342 static void emit_math2( struct brw_vs_compile *c,
343 GLuint function,
344 struct brw_reg dst,
345 struct brw_reg arg0,
346 struct brw_reg arg1,
347 GLuint precision)
348 {
349 struct brw_compile *p = &c->func;
350 struct brw_reg tmp = dst;
351 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
352 dst.file != BRW_GENERAL_REGISTER_FILE);
353
354 if (need_tmp)
355 tmp = get_tmp(c);
356
357 brw_MOV(p, brw_message_reg(3), arg1);
358
359 brw_math(p,
360 tmp,
361 function,
362 BRW_MATH_SATURATE_NONE,
363 2,
364 arg0,
365 BRW_MATH_DATA_SCALAR,
366 precision);
367
368 if (need_tmp) {
369 brw_MOV(p, dst, tmp);
370 release_tmp(c, tmp);
371 }
372 }
373
374
375
376 static void emit_exp_noalias( struct brw_vs_compile *c,
377 struct brw_reg dst,
378 struct brw_reg arg0 )
379 {
380 struct brw_compile *p = &c->func;
381
382
383 if (dst.dw1.bits.writemask & WRITEMASK_X) {
384 struct brw_reg tmp = get_tmp(c);
385 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
386
387 /* tmp_d = floor(arg0.x) */
388 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
389
390 /* result[0] = 2.0 ^ tmp */
391
392 /* Adjust exponent for floating point:
393 * exp += 127
394 */
395 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
396
397 /* Install exponent and sign.
398 * Excess drops off the edge:
399 */
400 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
401 tmp_d, brw_imm_d(23));
402
403 release_tmp(c, tmp);
404 }
405
406 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
407 /* result[1] = arg0.x - floor(arg0.x) */
408 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
409 }
410
411 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
412 /* As with the LOG instruction, we might be better off just
413 * doing a taylor expansion here, seeing as we have to do all
414 * the prep work.
415 *
416 * If mathbox partial precision is too low, consider also:
417 * result[3] = result[0] * EXP(result[1])
418 */
419 emit_math1(c,
420 BRW_MATH_FUNCTION_EXP,
421 brw_writemask(dst, WRITEMASK_Z),
422 brw_swizzle1(arg0, 0),
423 BRW_MATH_PRECISION_PARTIAL);
424 }
425
426 if (dst.dw1.bits.writemask & WRITEMASK_W) {
427 /* result[3] = 1.0; */
428 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
429 }
430 }
431
432
433 static void emit_log_noalias( struct brw_vs_compile *c,
434 struct brw_reg dst,
435 struct brw_reg arg0 )
436 {
437 struct brw_compile *p = &c->func;
438 struct brw_reg tmp = dst;
439 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
440 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
441 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
442 dst.file != BRW_GENERAL_REGISTER_FILE);
443
444 if (need_tmp) {
445 tmp = get_tmp(c);
446 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
447 }
448
449 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
450 * according to spec:
451 *
452 * These almost look likey they could be joined up, but not really
453 * practical:
454 *
455 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
456 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
457 */
458 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
459 brw_AND(p,
460 brw_writemask(tmp_ud, WRITEMASK_X),
461 brw_swizzle1(arg0_ud, 0),
462 brw_imm_ud((1U<<31)-1));
463
464 brw_SHR(p,
465 brw_writemask(tmp_ud, WRITEMASK_X),
466 tmp_ud,
467 brw_imm_ud(23));
468
469 brw_ADD(p,
470 brw_writemask(tmp, WRITEMASK_X),
471 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
472 brw_imm_d(-127));
473 }
474
475 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
476 brw_AND(p,
477 brw_writemask(tmp_ud, WRITEMASK_Y),
478 brw_swizzle1(arg0_ud, 0),
479 brw_imm_ud((1<<23)-1));
480
481 brw_OR(p,
482 brw_writemask(tmp_ud, WRITEMASK_Y),
483 tmp_ud,
484 brw_imm_ud(127<<23));
485 }
486
487 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
488 /* result[2] = result[0] + LOG2(result[1]); */
489
490 /* Why bother? The above is just a hint how to do this with a
491 * taylor series. Maybe we *should* use a taylor series as by
492 * the time all the above has been done it's almost certainly
493 * quicker than calling the mathbox, even with low precision.
494 *
495 * Options are:
496 * - result[0] + mathbox.LOG2(result[1])
497 * - mathbox.LOG2(arg0.x)
498 * - result[0] + inline_taylor_approx(result[1])
499 */
500 emit_math1(c,
501 BRW_MATH_FUNCTION_LOG,
502 brw_writemask(tmp, WRITEMASK_Z),
503 brw_swizzle1(tmp, 1),
504 BRW_MATH_PRECISION_FULL);
505
506 brw_ADD(p,
507 brw_writemask(tmp, WRITEMASK_Z),
508 brw_swizzle1(tmp, 2),
509 brw_swizzle1(tmp, 0));
510 }
511
512 if (dst.dw1.bits.writemask & WRITEMASK_W) {
513 /* result[3] = 1.0; */
514 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
515 }
516
517 if (need_tmp) {
518 brw_MOV(p, dst, tmp);
519 release_tmp(c, tmp);
520 }
521 }
522
523
524
525
526 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
527 */
528 static void emit_dst_noalias( struct brw_vs_compile *c,
529 struct brw_reg dst,
530 struct brw_reg arg0,
531 struct brw_reg arg1)
532 {
533 struct brw_compile *p = &c->func;
534
535 /* There must be a better way to do this:
536 */
537 if (dst.dw1.bits.writemask & WRITEMASK_X)
538 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
539 if (dst.dw1.bits.writemask & WRITEMASK_Y)
540 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
541 if (dst.dw1.bits.writemask & WRITEMASK_Z)
542 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
543 if (dst.dw1.bits.writemask & WRITEMASK_W)
544 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
545 }
546
547 static void emit_xpd( struct brw_compile *p,
548 struct brw_reg dst,
549 struct brw_reg t,
550 struct brw_reg u)
551 {
552 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
553 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
554 }
555
556
557
558 static void emit_lit_noalias( struct brw_vs_compile *c,
559 struct brw_reg dst,
560 struct brw_reg arg0 )
561 {
562 struct brw_compile *p = &c->func;
563 struct brw_instruction *if_insn;
564 struct brw_reg tmp = dst;
565 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
566
567 if (need_tmp)
568 tmp = get_tmp(c);
569
570 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
571 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
572
573 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
574 * to get all channels active inside the IF. In the clipping code
575 * we run with NoMask, so it's not an option and we can use
576 * BRW_EXECUTE_1 for all comparisions.
577 */
578 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
579 if_insn = brw_IF(p, BRW_EXECUTE_8);
580 {
581 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
582
583 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
584 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
585 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
586
587 emit_math2(c,
588 BRW_MATH_FUNCTION_POW,
589 brw_writemask(dst, WRITEMASK_Z),
590 brw_swizzle1(tmp, 2),
591 brw_swizzle1(arg0, 3),
592 BRW_MATH_PRECISION_PARTIAL);
593 }
594
595 brw_ENDIF(p, if_insn);
596 }
597
598
599
600
601
602 /* TODO: relative addressing!
603 */
604 static struct brw_reg get_reg( struct brw_vs_compile *c,
605 GLuint file,
606 GLuint index )
607 {
608
609 switch (file) {
610 case PROGRAM_TEMPORARY:
611 case PROGRAM_INPUT:
612 case PROGRAM_OUTPUT:
613 assert(c->regs[file][index].nr != 0);
614 return c->regs[file][index];
615 case PROGRAM_STATE_VAR:
616 case PROGRAM_CONSTANT:
617 case PROGRAM_UNIFORM:
618 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
619 return c->regs[PROGRAM_STATE_VAR][index];
620 case PROGRAM_ADDRESS:
621 assert(index == 0);
622 return c->regs[file][index];
623
624 case PROGRAM_UNDEFINED: /* undef values */
625 return brw_null_reg();
626
627 case PROGRAM_LOCAL_PARAM:
628 case PROGRAM_ENV_PARAM:
629 case PROGRAM_WRITE_ONLY:
630 default:
631 assert(0);
632 return brw_null_reg();
633 }
634 }
635
636
637
638 static struct brw_reg deref( struct brw_vs_compile *c,
639 struct brw_reg arg,
640 GLint offset)
641 {
642 struct brw_compile *p = &c->func;
643 struct brw_reg tmp = vec4(get_tmp(c));
644 struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
645 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
646 struct brw_reg indirect = brw_vec4_indirect(0,0);
647
648 {
649 brw_push_insn_state(p);
650 brw_set_access_mode(p, BRW_ALIGN_1);
651
652 /* This is pretty clunky - load the address register twice and
653 * fetch each 4-dword value in turn. There must be a way to do
654 * this in a single pass, but I couldn't get it to work.
655 */
656 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
657 brw_MOV(p, tmp, indirect);
658
659 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
660 brw_MOV(p, suboffset(tmp, 4), indirect);
661
662 brw_pop_insn_state(p);
663 }
664
665 return vec8(tmp);
666 }
667
668
669 static void emit_arl( struct brw_vs_compile *c,
670 struct brw_reg dst,
671 struct brw_reg arg0 )
672 {
673 struct brw_compile *p = &c->func;
674 struct brw_reg tmp = dst;
675 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
676
677 if (need_tmp)
678 tmp = get_tmp(c);
679
680 brw_RNDD(p, tmp, arg0);
681 brw_MUL(p, dst, tmp, brw_imm_d(16));
682
683 if (need_tmp)
684 release_tmp(c, tmp);
685 }
686
687
688 /* Will return mangled results for SWZ op. The emit_swz() function
689 * ignores this result and recalculates taking extended swizzles into
690 * account.
691 */
692 static struct brw_reg get_arg( struct brw_vs_compile *c,
693 struct prog_src_register *src )
694 {
695 struct brw_reg reg;
696
697 if (src->File == PROGRAM_UNDEFINED)
698 return brw_null_reg();
699
700 if (src->RelAddr)
701 reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
702 else
703 reg = get_reg(c, src->File, src->Index);
704
705 /* Convert 3-bit swizzle to 2-bit.
706 */
707 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
708 GET_SWZ(src->Swizzle, 1),
709 GET_SWZ(src->Swizzle, 2),
710 GET_SWZ(src->Swizzle, 3));
711
712 /* Note this is ok for non-swizzle instructions:
713 */
714 reg.negate = src->NegateBase ? 1 : 0;
715
716 return reg;
717 }
718
719
720 static struct brw_reg get_dst( struct brw_vs_compile *c,
721 struct prog_dst_register dst )
722 {
723 struct brw_reg reg = get_reg(c, dst.File, dst.Index);
724
725 reg.dw1.bits.writemask = dst.WriteMask;
726
727 return reg;
728 }
729
730
731
732
733 static void emit_swz( struct brw_vs_compile *c,
734 struct brw_reg dst,
735 struct prog_src_register src )
736 {
737 struct brw_compile *p = &c->func;
738 GLuint zeros_mask = 0;
739 GLuint ones_mask = 0;
740 GLuint src_mask = 0;
741 GLubyte src_swz[4];
742 GLboolean need_tmp = (src.NegateBase &&
743 dst.file != BRW_GENERAL_REGISTER_FILE);
744 struct brw_reg tmp = dst;
745 GLuint i;
746
747 if (need_tmp)
748 tmp = get_tmp(c);
749
750 for (i = 0; i < 4; i++) {
751 if (dst.dw1.bits.writemask & (1<<i)) {
752 GLubyte s = GET_SWZ(src.Swizzle, i);
753 switch (s) {
754 case SWIZZLE_X:
755 case SWIZZLE_Y:
756 case SWIZZLE_Z:
757 case SWIZZLE_W:
758 src_mask |= 1<<i;
759 src_swz[i] = s;
760 break;
761 case SWIZZLE_ZERO:
762 zeros_mask |= 1<<i;
763 break;
764 case SWIZZLE_ONE:
765 ones_mask |= 1<<i;
766 break;
767 }
768 }
769 }
770
771 /* Do src first, in case dst aliases src:
772 */
773 if (src_mask) {
774 struct brw_reg arg0;
775
776 if (src.RelAddr)
777 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
778 else
779 arg0 = get_reg(c, src.File, src.Index);
780
781 arg0 = brw_swizzle(arg0,
782 src_swz[0], src_swz[1],
783 src_swz[2], src_swz[3]);
784
785 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
786 }
787
788 if (zeros_mask)
789 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
790
791 if (ones_mask)
792 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
793
794 if (src.NegateBase)
795 brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
796
797 if (need_tmp) {
798 brw_MOV(p, dst, tmp);
799 release_tmp(c, tmp);
800 }
801 }
802
803
804
805 /* Post-vertex-program processing. Send the results to the URB.
806 */
807 static void emit_vertex_write( struct brw_vs_compile *c)
808 {
809 struct brw_compile *p = &c->func;
810 struct brw_reg m0 = brw_message_reg(0);
811 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
812 struct brw_reg ndc;
813
814 if (c->key.copy_edgeflag) {
815 brw_MOV(p,
816 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
817 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
818 }
819
820
821 /* Build ndc coords */
822 if (!c->key.know_w_is_one) {
823 ndc = get_tmp(c);
824 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
825 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
826 }
827 else {
828 ndc = pos;
829 }
830
831 /* Update the header for point size, user clipping flags, and -ve rhw
832 * workaround.
833 */
834 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
835 c->key.nr_userclip ||
836 (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one))
837 {
838 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
839 GLuint i;
840
841 brw_MOV(p, header1, brw_imm_ud(0));
842
843 brw_set_access_mode(p, BRW_ALIGN_16);
844
845 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
846 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
847 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
848 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
849 }
850
851
852 for (i = 0; i < c->key.nr_userclip; i++) {
853 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
854 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
855 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
856 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
857 }
858
859
860 /* i965 clipping workaround:
861 * 1) Test for -ve rhw
862 * 2) If set,
863 * set ndc = (0,0,0,0)
864 * set ucp[6] = 1
865 *
866 * Later, clipping will detect ucp[6] and ensure the primitive is
867 * clipped against all fixed planes.
868 */
869 if (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one) {
870 brw_CMP(p,
871 vec8(brw_null_reg()),
872 BRW_CONDITIONAL_L,
873 brw_swizzle1(ndc, 3),
874 brw_imm_f(0));
875
876 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
877 brw_MOV(p, ndc, brw_imm_f(0));
878 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
879 }
880
881 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
882 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
883 brw_set_access_mode(p, BRW_ALIGN_16);
884
885 release_tmp(c, header1);
886 }
887 else {
888 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
889 }
890
891
892 /* Emit the (interleaved) headers for the two vertices - an 8-reg
893 * of zeros followed by two sets of NDC coordinates:
894 */
895 brw_set_access_mode(p, BRW_ALIGN_1);
896 brw_MOV(p, offset(m0, 2), ndc);
897 brw_MOV(p, offset(m0, 3), pos);
898
899
900 brw_urb_WRITE(p,
901 brw_null_reg(), /* dest */
902 0, /* starting mrf reg nr */
903 c->r0, /* src */
904 0, /* allocate */
905 1, /* used */
906 c->nr_outputs + 3, /* msg len */
907 0, /* response len */
908 1, /* eot */
909 1, /* writes complete */
910 0, /* urb destination offset */
911 BRW_URB_SWIZZLE_INTERLEAVE);
912
913 }
914
915 static void
916 post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
917 {
918 GLuint nr_insns = c->vp->program.Base.NumInstructions;
919 GLuint insn, target_insn;
920 struct prog_instruction *inst1, *inst2;
921 struct brw_instruction *brw_inst1, *brw_inst2;
922 int offset;
923 for (insn = 0; insn < nr_insns; insn++) {
924 inst1 = &c->vp->program.Base.Instructions[insn];
925 brw_inst1 = inst1->Data;
926 switch (inst1->Opcode) {
927 case OPCODE_CAL:
928 case OPCODE_BRA:
929 target_insn = inst1->BranchTarget;
930 inst2 = &c->vp->program.Base.Instructions[target_insn];
931 brw_inst2 = inst2->Data;
932 offset = brw_inst2 - brw_inst1;
933 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
934 break;
935 case OPCODE_END:
936 offset = end_inst - brw_inst1;
937 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
938 break;
939 default:
940 break;
941 }
942 }
943 }
944
945 /* Emit the fragment program instructions here.
946 */
947 void brw_vs_emit(struct brw_vs_compile *c )
948 {
949 #define MAX_IFSN 32
950 struct brw_compile *p = &c->func;
951 GLuint nr_insns = c->vp->program.Base.NumInstructions;
952 GLuint insn, if_insn = 0;
953 struct brw_instruction *end_inst;
954 struct brw_instruction *if_inst[MAX_IFSN];
955 struct brw_indirect stack_index = brw_indirect(0, 0);
956
957 GLuint index;
958 GLuint file;
959
960 if (INTEL_DEBUG & DEBUG_VS) {
961 _mesa_printf("\n\n\nvs-emit:\n");
962 _mesa_print_program(&c->vp->program.Base);
963 _mesa_printf("\n");
964 }
965
966 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
967 brw_set_access_mode(p, BRW_ALIGN_16);
968
969 /* Message registers can't be read, so copy the output into GRF register
970 if they are used in source registers */
971 for (insn = 0; insn < nr_insns; insn++) {
972 GLuint i;
973 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
974 for (i = 0; i < 3; i++) {
975 struct prog_src_register *src = &inst->SrcReg[i];
976 GLuint index = src->Index;
977 GLuint file = src->File;
978 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
979 c->output_regs[index].used_in_src = GL_TRUE;
980 }
981 }
982
983 /* Static register allocation
984 */
985 brw_vs_alloc_regs(c);
986 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
987
988 for (insn = 0; insn < nr_insns; insn++) {
989
990 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
991 struct brw_reg args[3], dst;
992 GLuint i;
993
994 /* Get argument regs. SWZ is special and does this itself.
995 */
996 inst->Data = &p->store[p->nr_insn];
997 if (inst->Opcode != OPCODE_SWZ)
998 for (i = 0; i < 3; i++) {
999 struct prog_src_register *src = &inst->SrcReg[i];
1000 index = src->Index;
1001 file = src->File;
1002 if (file == PROGRAM_OUTPUT&&c->output_regs[index].used_in_src)
1003 args[i] = c->output_regs[index].reg;
1004 else
1005 args[i] = get_arg(c, src);
1006 }
1007
1008 /* Get dest regs. Note that it is possible for a reg to be both
1009 * dst and arg, given the static allocation of registers. So
1010 * care needs to be taken emitting multi-operation instructions.
1011 */
1012 index = inst->DstReg.Index;
1013 file = inst->DstReg.File;
1014 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1015 dst = c->output_regs[index].reg;
1016 else
1017 dst = get_dst(c, inst->DstReg);
1018
1019 switch (inst->Opcode) {
1020 case OPCODE_ABS:
1021 brw_MOV(p, dst, brw_abs(args[0]));
1022 break;
1023 case OPCODE_ADD:
1024 brw_ADD(p, dst, args[0], args[1]);
1025 break;
1026 case OPCODE_COS:
1027 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1028 break;
1029 case OPCODE_DP3:
1030 brw_DP3(p, dst, args[0], args[1]);
1031 break;
1032 case OPCODE_DP4:
1033 brw_DP4(p, dst, args[0], args[1]);
1034 break;
1035 case OPCODE_DPH:
1036 brw_DPH(p, dst, args[0], args[1]);
1037 break;
1038 case OPCODE_DST:
1039 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1040 break;
1041 case OPCODE_EXP:
1042 unalias1(c, dst, args[0], emit_exp_noalias);
1043 break;
1044 case OPCODE_EX2:
1045 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1046 break;
1047 case OPCODE_ARL:
1048 emit_arl(c, dst, args[0]);
1049 break;
1050 case OPCODE_FLR:
1051 brw_RNDD(p, dst, args[0]);
1052 break;
1053 case OPCODE_FRC:
1054 brw_FRC(p, dst, args[0]);
1055 break;
1056 case OPCODE_LOG:
1057 unalias1(c, dst, args[0], emit_log_noalias);
1058 break;
1059 case OPCODE_LG2:
1060 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1061 break;
1062 case OPCODE_LIT:
1063 unalias1(c, dst, args[0], emit_lit_noalias);
1064 break;
1065 case OPCODE_MAD:
1066 brw_MOV(p, brw_acc_reg(), args[2]);
1067 brw_MAC(p, dst, args[0], args[1]);
1068 break;
1069 case OPCODE_MAX:
1070 emit_max(p, dst, args[0], args[1]);
1071 break;
1072 case OPCODE_MIN:
1073 emit_min(p, dst, args[0], args[1]);
1074 break;
1075 case OPCODE_MOV:
1076 brw_MOV(p, dst, args[0]);
1077 break;
1078 case OPCODE_MUL:
1079 brw_MUL(p, dst, args[0], args[1]);
1080 break;
1081 case OPCODE_POW:
1082 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1083 break;
1084 case OPCODE_RCP:
1085 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1086 break;
1087 case OPCODE_RSQ:
1088 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1089 break;
1090
1091 case OPCODE_SEQ:
1092 emit_seq(p, dst, args[0], args[1]);
1093 break;
1094 case OPCODE_SIN:
1095 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
1096 break;
1097 case OPCODE_SNE:
1098 emit_sne(p, dst, args[0], args[1]);
1099 break;
1100 case OPCODE_SGE:
1101 emit_sge(p, dst, args[0], args[1]);
1102 break;
1103 case OPCODE_SGT:
1104 emit_sgt(p, dst, args[0], args[1]);
1105 break;
1106 case OPCODE_SLT:
1107 emit_slt(p, dst, args[0], args[1]);
1108 break;
1109 case OPCODE_SLE:
1110 emit_sle(p, dst, args[0], args[1]);
1111 break;
1112 case OPCODE_SUB:
1113 brw_ADD(p, dst, args[0], negate(args[1]));
1114 break;
1115 case OPCODE_SWZ:
1116 /* The args[0] value can't be used here as it won't have
1117 * correctly encoded the full swizzle:
1118 */
1119 emit_swz(c, dst, inst->SrcReg[0] );
1120 break;
1121 case OPCODE_XPD:
1122 emit_xpd(p, dst, args[0], args[1]);
1123 break;
1124 case OPCODE_IF:
1125 assert(if_insn < MAX_IFSN);
1126 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
1127 break;
1128 case OPCODE_ELSE:
1129 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
1130 break;
1131 case OPCODE_ENDIF:
1132 assert(if_insn > 0);
1133 brw_ENDIF(p, if_inst[--if_insn]);
1134 break;
1135 case OPCODE_BRA:
1136 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1137 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1138 brw_set_predicate_control_flag_value(p, 0xff);
1139 break;
1140 case OPCODE_CAL:
1141 brw_set_access_mode(p, BRW_ALIGN_1);
1142 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1143 brw_set_access_mode(p, BRW_ALIGN_16);
1144 brw_ADD(p, get_addr_reg(stack_index),
1145 get_addr_reg(stack_index), brw_imm_d(4));
1146 inst->Data = &p->store[p->nr_insn];
1147 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1148 break;
1149 case OPCODE_RET:
1150 brw_ADD(p, get_addr_reg(stack_index),
1151 get_addr_reg(stack_index), brw_imm_d(-4));
1152 brw_set_access_mode(p, BRW_ALIGN_1);
1153 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1154 brw_set_access_mode(p, BRW_ALIGN_16);
1155 case OPCODE_END:
1156 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1157 break;
1158 case OPCODE_PRINT:
1159 case OPCODE_BGNSUB:
1160 case OPCODE_ENDSUB:
1161 break;
1162 default:
1163 _mesa_printf("Unsupported opcode %i (%s) in vertex shader\n",
1164 inst->Opcode, inst->Opcode < MAX_OPCODE ?
1165 _mesa_opcode_string(inst->Opcode) :
1166 "unknown");
1167 break;
1168 }
1169
1170 if ((inst->DstReg.File == PROGRAM_OUTPUT)
1171 && (inst->DstReg.Index != VERT_RESULT_HPOS)
1172 && c->output_regs[inst->DstReg.Index].used_in_src) {
1173 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1174 }
1175
1176 /* Result color clamping.
1177 *
1178 * When destination register is an output register and
1179 * it's primary/secondary front/back color, we have to clamp
1180 * the result to [0,1]. This is done by enabling the
1181 * saturation bit for the last instruction.
1182 *
1183 * We don't use brw_set_saturate() as it modifies
1184 * p->current->header.saturate, which affects all the subsequent
1185 * instructions. Instead, we directly modify the header
1186 * of the last (already stored) instruction.
1187 */
1188 if (inst->DstReg.File == PROGRAM_OUTPUT) {
1189 if ((inst->DstReg.Index == VERT_RESULT_COL0)
1190 || (inst->DstReg.Index == VERT_RESULT_COL1)
1191 || (inst->DstReg.Index == VERT_RESULT_BFC0)
1192 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
1193 p->store[p->nr_insn-1].header.saturate = 1;
1194 }
1195 }
1196
1197 release_tmps(c);
1198 }
1199
1200 end_inst = &p->store[p->nr_insn];
1201 emit_vertex_write(c);
1202 post_vs_emit(c, end_inst);
1203 for (insn = 0; insn < nr_insns; insn++)
1204 c->vp->program.Base.Instructions[insn].Data = NULL;
1205 }