i965: official name for GM45 chipset
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41
42 /* Do things as simply as possible. Allocate and populate all regs
43 * ahead of time.
44 */
45 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
46 {
47 GLuint i, reg = 0, mrf;
48 GLuint nr_params;
49
50 /* r0 -- reserved as usual
51 */
52 c->r0 = brw_vec8_grf(reg, 0); reg++;
53
54 /* User clip planes from curbe:
55 */
56 if (c->key.nr_userclip) {
57 for (i = 0; i < c->key.nr_userclip; i++) {
58 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
59 }
60
61 /* Deal with curbe alignment:
62 */
63 reg += ((6+c->key.nr_userclip+3)/4)*2;
64 }
65
66 /* Vertex program parameters from curbe:
67 */
68 nr_params = c->vp->program.Base.Parameters->NumParameters;
69 for (i = 0; i < nr_params; i++) {
70 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
71 }
72 reg += (nr_params+1)/2;
73
74 c->prog_data.curb_read_length = reg - 1;
75
76
77
78 /* Allocate input regs:
79 */
80 c->nr_inputs = 0;
81 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
82 if (c->prog_data.inputs_read & (1<<i)) {
83 c->nr_inputs++;
84 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
85 reg++;
86 }
87 }
88
89
90 /* Allocate outputs: TODO: could organize the non-position outputs
91 * to go straight into message regs.
92 */
93 c->nr_outputs = 0;
94 c->first_output = reg;
95 mrf = 4;
96 for (i = 0; i < VERT_RESULT_MAX; i++) {
97 if (c->prog_data.outputs_written & (1<<i)) {
98 c->nr_outputs++;
99 if (i == VERT_RESULT_HPOS) {
100 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
101 reg++;
102 }
103 else if (i == VERT_RESULT_PSIZ) {
104 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
105 reg++;
106 mrf++; /* just a placeholder? XXX fix later stages & remove this */
107 }
108 else {
109 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
110 mrf++;
111 }
112 }
113 }
114
115 /* Allocate program temporaries:
116 */
117 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
118 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
119 reg++;
120 }
121
122 /* Address reg(s). Don't try to use the internal address reg until
123 * deref time.
124 */
125 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
126 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
127 reg,
128 0,
129 BRW_REGISTER_TYPE_D,
130 BRW_VERTICAL_STRIDE_8,
131 BRW_WIDTH_8,
132 BRW_HORIZONTAL_STRIDE_1,
133 BRW_SWIZZLE_XXXX,
134 WRITEMASK_X);
135 reg++;
136 }
137
138 for (i = 0; i < 128; i++) {
139 if (c->output_regs[i].used_in_src) {
140 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
141 reg++;
142 }
143 }
144
145 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
146 reg += 2;
147
148
149 /* Some opcodes need an internal temporary:
150 */
151 c->first_tmp = reg;
152 c->last_tmp = reg; /* for allocation purposes */
153
154 /* Each input reg holds data from two vertices. The
155 * urb_read_length is the number of registers read from *each*
156 * vertex urb, so is half the amount:
157 */
158 c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
159
160 c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
161 c->prog_data.total_grf = reg;
162 }
163
164
165 static struct brw_reg get_tmp( struct brw_vs_compile *c )
166 {
167 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
168
169 if (++c->last_tmp > c->prog_data.total_grf)
170 c->prog_data.total_grf = c->last_tmp;
171
172 return tmp;
173 }
174
175 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
176 {
177 if (tmp.nr == c->last_tmp-1)
178 c->last_tmp--;
179 }
180
181 static void release_tmps( struct brw_vs_compile *c )
182 {
183 c->last_tmp = c->first_tmp;
184 }
185
186
187 static void unalias1( struct brw_vs_compile *c,
188 struct brw_reg dst,
189 struct brw_reg arg0,
190 void (*func)( struct brw_vs_compile *,
191 struct brw_reg,
192 struct brw_reg ))
193 {
194 if (dst.file == arg0.file && dst.nr == arg0.nr) {
195 struct brw_compile *p = &c->func;
196 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
197 func(c, tmp, arg0);
198 brw_MOV(p, dst, tmp);
199 }
200 else {
201 func(c, dst, arg0);
202 }
203 }
204
205 static void unalias2( struct brw_vs_compile *c,
206 struct brw_reg dst,
207 struct brw_reg arg0,
208 struct brw_reg arg1,
209 void (*func)( struct brw_vs_compile *,
210 struct brw_reg,
211 struct brw_reg,
212 struct brw_reg ))
213 {
214 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
215 (dst.file == arg1.file && dst.nr == arg1.nr)) {
216 struct brw_compile *p = &c->func;
217 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
218 func(c, tmp, arg0, arg1);
219 brw_MOV(p, dst, tmp);
220 }
221 else {
222 func(c, dst, arg0, arg1);
223 }
224 }
225
226 static void emit_sop( struct brw_compile *p,
227 struct brw_reg dst,
228 struct brw_reg arg0,
229 struct brw_reg arg1,
230 GLuint cond)
231 {
232 brw_MOV(p, dst, brw_imm_f(0.0f));
233 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
234 brw_MOV(p, dst, brw_imm_f(1.0f));
235 brw_set_predicate_control_flag_value(p, 0xff);
236 }
237
238 static void emit_seq( struct brw_compile *p,
239 struct brw_reg dst,
240 struct brw_reg arg0,
241 struct brw_reg arg1 )
242 {
243 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
244 }
245
246 static void emit_sne( struct brw_compile *p,
247 struct brw_reg dst,
248 struct brw_reg arg0,
249 struct brw_reg arg1 )
250 {
251 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
252 }
253 static void emit_slt( struct brw_compile *p,
254 struct brw_reg dst,
255 struct brw_reg arg0,
256 struct brw_reg arg1 )
257 {
258 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
259 }
260
261 static void emit_sle( struct brw_compile *p,
262 struct brw_reg dst,
263 struct brw_reg arg0,
264 struct brw_reg arg1 )
265 {
266 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
267 }
268
269 static void emit_sgt( struct brw_compile *p,
270 struct brw_reg dst,
271 struct brw_reg arg0,
272 struct brw_reg arg1 )
273 {
274 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
275 }
276
277 static void emit_sge( struct brw_compile *p,
278 struct brw_reg dst,
279 struct brw_reg arg0,
280 struct brw_reg arg1 )
281 {
282 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
283 }
284
285 static void emit_max( struct brw_compile *p,
286 struct brw_reg dst,
287 struct brw_reg arg0,
288 struct brw_reg arg1 )
289 {
290 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
291 brw_SEL(p, dst, arg1, arg0);
292 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
293 }
294
295 static void emit_min( struct brw_compile *p,
296 struct brw_reg dst,
297 struct brw_reg arg0,
298 struct brw_reg arg1 )
299 {
300 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
301 brw_SEL(p, dst, arg0, arg1);
302 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
303 }
304
305
306 static void emit_math1( struct brw_vs_compile *c,
307 GLuint function,
308 struct brw_reg dst,
309 struct brw_reg arg0,
310 GLuint precision)
311 {
312 /* There are various odd behaviours with SEND on the simulator. In
313 * addition there are documented issues with the fact that the GEN4
314 * processor doesn't do dependency control properly on SEND
315 * results. So, on balance, this kludge to get around failures
316 * with writemasked math results looks like it might be necessary
317 * whether that turns out to be a simulator bug or not:
318 */
319 struct brw_compile *p = &c->func;
320 struct brw_reg tmp = dst;
321 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
322 dst.file != BRW_GENERAL_REGISTER_FILE);
323
324 if (need_tmp)
325 tmp = get_tmp(c);
326
327 brw_math(p,
328 tmp,
329 function,
330 BRW_MATH_SATURATE_NONE,
331 2,
332 arg0,
333 BRW_MATH_DATA_SCALAR,
334 precision);
335
336 if (need_tmp) {
337 brw_MOV(p, dst, tmp);
338 release_tmp(c, tmp);
339 }
340 }
341
342 static void emit_math2( struct brw_vs_compile *c,
343 GLuint function,
344 struct brw_reg dst,
345 struct brw_reg arg0,
346 struct brw_reg arg1,
347 GLuint precision)
348 {
349 struct brw_compile *p = &c->func;
350 struct brw_reg tmp = dst;
351 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
352 dst.file != BRW_GENERAL_REGISTER_FILE);
353
354 if (need_tmp)
355 tmp = get_tmp(c);
356
357 brw_MOV(p, brw_message_reg(3), arg1);
358
359 brw_math(p,
360 tmp,
361 function,
362 BRW_MATH_SATURATE_NONE,
363 2,
364 arg0,
365 BRW_MATH_DATA_SCALAR,
366 precision);
367
368 if (need_tmp) {
369 brw_MOV(p, dst, tmp);
370 release_tmp(c, tmp);
371 }
372 }
373
374
375
376 static void emit_exp_noalias( struct brw_vs_compile *c,
377 struct brw_reg dst,
378 struct brw_reg arg0 )
379 {
380 struct brw_compile *p = &c->func;
381
382
383 if (dst.dw1.bits.writemask & WRITEMASK_X) {
384 struct brw_reg tmp = get_tmp(c);
385 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
386
387 /* tmp_d = floor(arg0.x) */
388 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
389
390 /* result[0] = 2.0 ^ tmp */
391
392 /* Adjust exponent for floating point:
393 * exp += 127
394 */
395 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
396
397 /* Install exponent and sign.
398 * Excess drops off the edge:
399 */
400 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
401 tmp_d, brw_imm_d(23));
402
403 release_tmp(c, tmp);
404 }
405
406 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
407 /* result[1] = arg0.x - floor(arg0.x) */
408 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
409 }
410
411 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
412 /* As with the LOG instruction, we might be better off just
413 * doing a taylor expansion here, seeing as we have to do all
414 * the prep work.
415 *
416 * If mathbox partial precision is too low, consider also:
417 * result[3] = result[0] * EXP(result[1])
418 */
419 emit_math1(c,
420 BRW_MATH_FUNCTION_EXP,
421 brw_writemask(dst, WRITEMASK_Z),
422 brw_swizzle1(arg0, 0),
423 BRW_MATH_PRECISION_PARTIAL);
424 }
425
426 if (dst.dw1.bits.writemask & WRITEMASK_W) {
427 /* result[3] = 1.0; */
428 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
429 }
430 }
431
432
433 static void emit_log_noalias( struct brw_vs_compile *c,
434 struct brw_reg dst,
435 struct brw_reg arg0 )
436 {
437 struct brw_compile *p = &c->func;
438 struct brw_reg tmp = dst;
439 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
440 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
441 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
442 dst.file != BRW_GENERAL_REGISTER_FILE);
443
444 if (need_tmp) {
445 tmp = get_tmp(c);
446 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
447 }
448
449 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
450 * according to spec:
451 *
452 * These almost look likey they could be joined up, but not really
453 * practical:
454 *
455 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
456 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
457 */
458 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
459 brw_AND(p,
460 brw_writemask(tmp_ud, WRITEMASK_X),
461 brw_swizzle1(arg0_ud, 0),
462 brw_imm_ud((1U<<31)-1));
463
464 brw_SHR(p,
465 brw_writemask(tmp_ud, WRITEMASK_X),
466 tmp_ud,
467 brw_imm_ud(23));
468
469 brw_ADD(p,
470 brw_writemask(tmp, WRITEMASK_X),
471 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
472 brw_imm_d(-127));
473 }
474
475 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
476 brw_AND(p,
477 brw_writemask(tmp_ud, WRITEMASK_Y),
478 brw_swizzle1(arg0_ud, 0),
479 brw_imm_ud((1<<23)-1));
480
481 brw_OR(p,
482 brw_writemask(tmp_ud, WRITEMASK_Y),
483 tmp_ud,
484 brw_imm_ud(127<<23));
485 }
486
487 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
488 /* result[2] = result[0] + LOG2(result[1]); */
489
490 /* Why bother? The above is just a hint how to do this with a
491 * taylor series. Maybe we *should* use a taylor series as by
492 * the time all the above has been done it's almost certainly
493 * quicker than calling the mathbox, even with low precision.
494 *
495 * Options are:
496 * - result[0] + mathbox.LOG2(result[1])
497 * - mathbox.LOG2(arg0.x)
498 * - result[0] + inline_taylor_approx(result[1])
499 */
500 emit_math1(c,
501 BRW_MATH_FUNCTION_LOG,
502 brw_writemask(tmp, WRITEMASK_Z),
503 brw_swizzle1(tmp, 1),
504 BRW_MATH_PRECISION_FULL);
505
506 brw_ADD(p,
507 brw_writemask(tmp, WRITEMASK_Z),
508 brw_swizzle1(tmp, 2),
509 brw_swizzle1(tmp, 0));
510 }
511
512 if (dst.dw1.bits.writemask & WRITEMASK_W) {
513 /* result[3] = 1.0; */
514 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
515 }
516
517 if (need_tmp) {
518 brw_MOV(p, dst, tmp);
519 release_tmp(c, tmp);
520 }
521 }
522
523
524
525
526 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
527 */
528 static void emit_dst_noalias( struct brw_vs_compile *c,
529 struct brw_reg dst,
530 struct brw_reg arg0,
531 struct brw_reg arg1)
532 {
533 struct brw_compile *p = &c->func;
534
535 /* There must be a better way to do this:
536 */
537 if (dst.dw1.bits.writemask & WRITEMASK_X)
538 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
539 if (dst.dw1.bits.writemask & WRITEMASK_Y)
540 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
541 if (dst.dw1.bits.writemask & WRITEMASK_Z)
542 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
543 if (dst.dw1.bits.writemask & WRITEMASK_W)
544 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
545 }
546
547 static void emit_xpd( struct brw_compile *p,
548 struct brw_reg dst,
549 struct brw_reg t,
550 struct brw_reg u)
551 {
552 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
553 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
554 }
555
556
557
558 static void emit_lit_noalias( struct brw_vs_compile *c,
559 struct brw_reg dst,
560 struct brw_reg arg0 )
561 {
562 struct brw_compile *p = &c->func;
563 struct brw_instruction *if_insn;
564 struct brw_reg tmp = dst;
565 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
566
567 if (need_tmp)
568 tmp = get_tmp(c);
569
570 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
571 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
572
573 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
574 * to get all channels active inside the IF. In the clipping code
575 * we run with NoMask, so it's not an option and we can use
576 * BRW_EXECUTE_1 for all comparisions.
577 */
578 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
579 if_insn = brw_IF(p, BRW_EXECUTE_8);
580 {
581 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
582
583 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
584 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
585 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
586
587 emit_math2(c,
588 BRW_MATH_FUNCTION_POW,
589 brw_writemask(dst, WRITEMASK_Z),
590 brw_swizzle1(tmp, 2),
591 brw_swizzle1(arg0, 3),
592 BRW_MATH_PRECISION_PARTIAL);
593 }
594
595 brw_ENDIF(p, if_insn);
596 }
597
598
599
600
601
602 /* TODO: relative addressing!
603 */
604 static struct brw_reg get_reg( struct brw_vs_compile *c,
605 GLuint file,
606 GLuint index )
607 {
608
609 switch (file) {
610 case PROGRAM_TEMPORARY:
611 case PROGRAM_INPUT:
612 case PROGRAM_OUTPUT:
613 assert(c->regs[file][index].nr != 0);
614 return c->regs[file][index];
615 case PROGRAM_STATE_VAR:
616 case PROGRAM_CONSTANT:
617 case PROGRAM_UNIFORM:
618 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
619 return c->regs[PROGRAM_STATE_VAR][index];
620 case PROGRAM_ADDRESS:
621 assert(index == 0);
622 return c->regs[file][index];
623
624 case PROGRAM_UNDEFINED: /* undef values */
625 return brw_null_reg();
626
627 case PROGRAM_LOCAL_PARAM:
628 case PROGRAM_ENV_PARAM:
629 case PROGRAM_WRITE_ONLY:
630 default:
631 assert(0);
632 return brw_null_reg();
633 }
634 }
635
636
637
638 static struct brw_reg deref( struct brw_vs_compile *c,
639 struct brw_reg arg,
640 GLint offset)
641 {
642 struct brw_compile *p = &c->func;
643 struct brw_reg tmp = vec4(get_tmp(c));
644 struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
645 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
646 struct brw_reg indirect = brw_vec4_indirect(0,0);
647
648 {
649 brw_push_insn_state(p);
650 brw_set_access_mode(p, BRW_ALIGN_1);
651
652 /* This is pretty clunky - load the address register twice and
653 * fetch each 4-dword value in turn. There must be a way to do
654 * this in a single pass, but I couldn't get it to work.
655 */
656 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
657 brw_MOV(p, tmp, indirect);
658
659 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
660 brw_MOV(p, suboffset(tmp, 4), indirect);
661
662 brw_pop_insn_state(p);
663 }
664
665 return vec8(tmp);
666 }
667
668
669 static void emit_arl( struct brw_vs_compile *c,
670 struct brw_reg dst,
671 struct brw_reg arg0 )
672 {
673 struct brw_compile *p = &c->func;
674 struct brw_reg tmp = dst;
675 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
676
677 if (need_tmp)
678 tmp = get_tmp(c);
679
680 brw_RNDD(p, tmp, arg0);
681 brw_MUL(p, dst, tmp, brw_imm_d(16));
682
683 if (need_tmp)
684 release_tmp(c, tmp);
685 }
686
687
688 /* Will return mangled results for SWZ op. The emit_swz() function
689 * ignores this result and recalculates taking extended swizzles into
690 * account.
691 */
692 static struct brw_reg get_arg( struct brw_vs_compile *c,
693 struct prog_src_register *src )
694 {
695 struct brw_reg reg;
696
697 if (src->File == PROGRAM_UNDEFINED)
698 return brw_null_reg();
699
700 if (src->RelAddr)
701 reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
702 else
703 reg = get_reg(c, src->File, src->Index);
704
705 /* Convert 3-bit swizzle to 2-bit.
706 */
707 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
708 GET_SWZ(src->Swizzle, 1),
709 GET_SWZ(src->Swizzle, 2),
710 GET_SWZ(src->Swizzle, 3));
711
712 /* Note this is ok for non-swizzle instructions:
713 */
714 reg.negate = src->NegateBase ? 1 : 0;
715
716 return reg;
717 }
718
719
720 static struct brw_reg get_dst( struct brw_vs_compile *c,
721 struct prog_dst_register dst )
722 {
723 struct brw_reg reg = get_reg(c, dst.File, dst.Index);
724
725 reg.dw1.bits.writemask = dst.WriteMask;
726
727 return reg;
728 }
729
730
731
732
733 static void emit_swz( struct brw_vs_compile *c,
734 struct brw_reg dst,
735 struct prog_src_register src )
736 {
737 struct brw_compile *p = &c->func;
738 GLuint zeros_mask = 0;
739 GLuint ones_mask = 0;
740 GLuint src_mask = 0;
741 GLubyte src_swz[4];
742 GLboolean need_tmp = (src.NegateBase &&
743 dst.file != BRW_GENERAL_REGISTER_FILE);
744 struct brw_reg tmp = dst;
745 GLuint i;
746
747 if (need_tmp)
748 tmp = get_tmp(c);
749
750 for (i = 0; i < 4; i++) {
751 if (dst.dw1.bits.writemask & (1<<i)) {
752 GLubyte s = GET_SWZ(src.Swizzle, i);
753 switch (s) {
754 case SWIZZLE_X:
755 case SWIZZLE_Y:
756 case SWIZZLE_Z:
757 case SWIZZLE_W:
758 src_mask |= 1<<i;
759 src_swz[i] = s;
760 break;
761 case SWIZZLE_ZERO:
762 zeros_mask |= 1<<i;
763 break;
764 case SWIZZLE_ONE:
765 ones_mask |= 1<<i;
766 break;
767 }
768 }
769 }
770
771 /* Do src first, in case dst aliases src:
772 */
773 if (src_mask) {
774 struct brw_reg arg0;
775
776 if (src.RelAddr)
777 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
778 else
779 arg0 = get_reg(c, src.File, src.Index);
780
781 arg0 = brw_swizzle(arg0,
782 src_swz[0], src_swz[1],
783 src_swz[2], src_swz[3]);
784
785 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
786 }
787
788 if (zeros_mask)
789 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
790
791 if (ones_mask)
792 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
793
794 if (src.NegateBase)
795 brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
796
797 if (need_tmp) {
798 brw_MOV(p, dst, tmp);
799 release_tmp(c, tmp);
800 }
801 }
802
803
804
805 /* Post-vertex-program processing. Send the results to the URB.
806 */
807 static void emit_vertex_write( struct brw_vs_compile *c)
808 {
809 struct brw_compile *p = &c->func;
810 struct brw_reg m0 = brw_message_reg(0);
811 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
812 struct brw_reg ndc;
813
814 if (c->key.copy_edgeflag) {
815 brw_MOV(p,
816 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
817 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
818 }
819
820
821 /* Build ndc coords? TODO: Shortcircuit when w is known to be one.
822 */
823 if (!c->key.know_w_is_one) {
824 ndc = get_tmp(c);
825 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
826 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
827 }
828 else {
829 ndc = pos;
830 }
831
832 /* This includes the workaround for -ve rhw, so is no longer an
833 * optional step:
834 */
835 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
836 c->key.nr_userclip ||
837 !c->key.know_w_is_one)
838 {
839 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
840 GLuint i;
841
842 brw_MOV(p, header1, brw_imm_ud(0));
843
844 brw_set_access_mode(p, BRW_ALIGN_16);
845
846 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
847 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
848 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
849 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
850 }
851
852
853 for (i = 0; i < c->key.nr_userclip; i++) {
854 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
855 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
856 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
857 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
858 }
859
860
861 /* i965 clipping workaround:
862 * 1) Test for -ve rhw
863 * 2) If set,
864 * set ndc = (0,0,0,0)
865 * set ucp[6] = 1
866 *
867 * Later, clipping will detect ucp[6] and ensure the primitive is
868 * clipped against all fixed planes.
869 */
870 if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw)) && !c->key.know_w_is_one) {
871 brw_CMP(p,
872 vec8(brw_null_reg()),
873 BRW_CONDITIONAL_L,
874 brw_swizzle1(ndc, 3),
875 brw_imm_f(0));
876
877 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
878 brw_MOV(p, ndc, brw_imm_f(0));
879 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
880 }
881
882 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
883 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
884 brw_set_access_mode(p, BRW_ALIGN_16);
885
886 release_tmp(c, header1);
887 }
888 else {
889 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
890 }
891
892
893 /* Emit the (interleaved) headers for the two vertices - an 8-reg
894 * of zeros followed by two sets of NDC coordinates:
895 */
896 brw_set_access_mode(p, BRW_ALIGN_1);
897 brw_MOV(p, offset(m0, 2), ndc);
898 brw_MOV(p, offset(m0, 3), pos);
899
900
901 brw_urb_WRITE(p,
902 brw_null_reg(), /* dest */
903 0, /* starting mrf reg nr */
904 c->r0, /* src */
905 0, /* allocate */
906 1, /* used */
907 c->nr_outputs + 3, /* msg len */
908 0, /* response len */
909 1, /* eot */
910 1, /* writes complete */
911 0, /* urb destination offset */
912 BRW_URB_SWIZZLE_INTERLEAVE);
913
914 }
915
916 static void
917 post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
918 {
919 GLuint nr_insns = c->vp->program.Base.NumInstructions;
920 GLuint insn, target_insn;
921 struct prog_instruction *inst1, *inst2;
922 struct brw_instruction *brw_inst1, *brw_inst2;
923 int offset;
924 for (insn = 0; insn < nr_insns; insn++) {
925 inst1 = &c->vp->program.Base.Instructions[insn];
926 brw_inst1 = inst1->Data;
927 switch (inst1->Opcode) {
928 case OPCODE_CAL:
929 case OPCODE_BRA:
930 target_insn = inst1->BranchTarget;
931 inst2 = &c->vp->program.Base.Instructions[target_insn];
932 brw_inst2 = inst2->Data;
933 offset = brw_inst2 - brw_inst1;
934 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
935 break;
936 case OPCODE_END:
937 offset = end_inst - brw_inst1;
938 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
939 break;
940 default:
941 break;
942 }
943 }
944 }
945
946 /* Emit the fragment program instructions here.
947 */
948 void brw_vs_emit(struct brw_vs_compile *c )
949 {
950 #define MAX_IFSN 32
951 struct brw_compile *p = &c->func;
952 GLuint nr_insns = c->vp->program.Base.NumInstructions;
953 GLuint insn, if_insn = 0;
954 struct brw_instruction *end_inst;
955 struct brw_instruction *if_inst[MAX_IFSN];
956 struct brw_indirect stack_index = brw_indirect(0, 0);
957
958 GLuint index;
959 GLuint file;
960
961 if (INTEL_DEBUG & DEBUG_VS) {
962 _mesa_printf("\n\n\nvs-emit:\n");
963 _mesa_print_program(&c->vp->program.Base);
964 _mesa_printf("\n");
965 }
966
967 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
968 brw_set_access_mode(p, BRW_ALIGN_16);
969
970 /* Message registers can't be read, so copy the output into GRF register
971 if they are used in source registers */
972 for (insn = 0; insn < nr_insns; insn++) {
973 GLuint i;
974 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
975 for (i = 0; i < 3; i++) {
976 struct prog_src_register *src = &inst->SrcReg[i];
977 GLuint index = src->Index;
978 GLuint file = src->File;
979 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
980 c->output_regs[index].used_in_src = GL_TRUE;
981 }
982 }
983
984 /* Static register allocation
985 */
986 brw_vs_alloc_regs(c);
987 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
988
989 for (insn = 0; insn < nr_insns; insn++) {
990
991 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
992 struct brw_reg args[3], dst;
993 GLuint i;
994
995 /* Get argument regs. SWZ is special and does this itself.
996 */
997 inst->Data = &p->store[p->nr_insn];
998 if (inst->Opcode != OPCODE_SWZ)
999 for (i = 0; i < 3; i++) {
1000 struct prog_src_register *src = &inst->SrcReg[i];
1001 index = src->Index;
1002 file = src->File;
1003 if (file == PROGRAM_OUTPUT&&c->output_regs[index].used_in_src)
1004 args[i] = c->output_regs[index].reg;
1005 else
1006 args[i] = get_arg(c, src);
1007 }
1008
1009 /* Get dest regs. Note that it is possible for a reg to be both
1010 * dst and arg, given the static allocation of registers. So
1011 * care needs to be taken emitting multi-operation instructions.
1012 */
1013 index = inst->DstReg.Index;
1014 file = inst->DstReg.File;
1015 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1016 dst = c->output_regs[index].reg;
1017 else
1018 dst = get_dst(c, inst->DstReg);
1019
1020 switch (inst->Opcode) {
1021 case OPCODE_ABS:
1022 brw_MOV(p, dst, brw_abs(args[0]));
1023 break;
1024 case OPCODE_ADD:
1025 brw_ADD(p, dst, args[0], args[1]);
1026 break;
1027 case OPCODE_DP3:
1028 brw_DP3(p, dst, args[0], args[1]);
1029 break;
1030 case OPCODE_DP4:
1031 brw_DP4(p, dst, args[0], args[1]);
1032 break;
1033 case OPCODE_DPH:
1034 brw_DPH(p, dst, args[0], args[1]);
1035 break;
1036 case OPCODE_DST:
1037 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1038 break;
1039 case OPCODE_EXP:
1040 unalias1(c, dst, args[0], emit_exp_noalias);
1041 break;
1042 case OPCODE_EX2:
1043 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1044 break;
1045 case OPCODE_ARL:
1046 emit_arl(c, dst, args[0]);
1047 break;
1048 case OPCODE_FLR:
1049 brw_RNDD(p, dst, args[0]);
1050 break;
1051 case OPCODE_FRC:
1052 brw_FRC(p, dst, args[0]);
1053 break;
1054 case OPCODE_LOG:
1055 unalias1(c, dst, args[0], emit_log_noalias);
1056 break;
1057 case OPCODE_LG2:
1058 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1059 break;
1060 case OPCODE_LIT:
1061 unalias1(c, dst, args[0], emit_lit_noalias);
1062 break;
1063 case OPCODE_MAD:
1064 brw_MOV(p, brw_acc_reg(), args[2]);
1065 brw_MAC(p, dst, args[0], args[1]);
1066 break;
1067 case OPCODE_MAX:
1068 emit_max(p, dst, args[0], args[1]);
1069 break;
1070 case OPCODE_MIN:
1071 emit_min(p, dst, args[0], args[1]);
1072 break;
1073 case OPCODE_MOV:
1074 brw_MOV(p, dst, args[0]);
1075 break;
1076 case OPCODE_MUL:
1077 brw_MUL(p, dst, args[0], args[1]);
1078 break;
1079 case OPCODE_POW:
1080 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1081 break;
1082 case OPCODE_RCP:
1083 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1084 break;
1085 case OPCODE_RSQ:
1086 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1087 break;
1088
1089 case OPCODE_SEQ:
1090 emit_seq(p, dst, args[0], args[1]);
1091 break;
1092 case OPCODE_SNE:
1093 emit_sne(p, dst, args[0], args[1]);
1094 break;
1095 case OPCODE_SGE:
1096 emit_sge(p, dst, args[0], args[1]);
1097 break;
1098 case OPCODE_SGT:
1099 emit_sgt(p, dst, args[0], args[1]);
1100 break;
1101 case OPCODE_SLT:
1102 emit_slt(p, dst, args[0], args[1]);
1103 break;
1104 case OPCODE_SLE:
1105 emit_sle(p, dst, args[0], args[1]);
1106 break;
1107 case OPCODE_SUB:
1108 brw_ADD(p, dst, args[0], negate(args[1]));
1109 break;
1110 case OPCODE_SWZ:
1111 /* The args[0] value can't be used here as it won't have
1112 * correctly encoded the full swizzle:
1113 */
1114 emit_swz(c, dst, inst->SrcReg[0] );
1115 break;
1116 case OPCODE_XPD:
1117 emit_xpd(p, dst, args[0], args[1]);
1118 break;
1119 case OPCODE_IF:
1120 assert(if_insn < MAX_IFSN);
1121 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
1122 break;
1123 case OPCODE_ELSE:
1124 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
1125 break;
1126 case OPCODE_ENDIF:
1127 assert(if_insn > 0);
1128 brw_ENDIF(p, if_inst[--if_insn]);
1129 break;
1130 case OPCODE_BRA:
1131 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1132 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1133 brw_set_predicate_control_flag_value(p, 0xff);
1134 break;
1135 case OPCODE_CAL:
1136 brw_set_access_mode(p, BRW_ALIGN_1);
1137 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1138 brw_set_access_mode(p, BRW_ALIGN_16);
1139 brw_ADD(p, get_addr_reg(stack_index),
1140 get_addr_reg(stack_index), brw_imm_d(4));
1141 inst->Data = &p->store[p->nr_insn];
1142 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1143 break;
1144 case OPCODE_RET:
1145 brw_ADD(p, get_addr_reg(stack_index),
1146 get_addr_reg(stack_index), brw_imm_d(-4));
1147 brw_set_access_mode(p, BRW_ALIGN_1);
1148 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
1149 brw_set_access_mode(p, BRW_ALIGN_16);
1150 case OPCODE_END:
1151 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1152 break;
1153 case OPCODE_PRINT:
1154 case OPCODE_BGNSUB:
1155 case OPCODE_ENDSUB:
1156 break;
1157 default:
1158 _mesa_printf("Unsupport opcode %d in vertex shader\n", inst->Opcode);
1159 break;
1160 }
1161
1162 if (inst->DstReg.File == PROGRAM_OUTPUT
1163 &&inst->DstReg.Index != VERT_RESULT_HPOS
1164 &&c->output_regs[inst->DstReg.Index].used_in_src)
1165 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1166
1167 release_tmps(c);
1168 }
1169
1170 end_inst = &p->store[p->nr_insn];
1171 emit_vertex_write(c);
1172 post_vs_emit(c, end_inst);
1173 for (insn = 0; insn < nr_insns; insn++)
1174 c->vp->program.Base.Instructions[insn].Data = NULL;
1175 }