[i915] Include header to pick up intel_ttm_bo_create_from_handle() proto.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "shader/program.h"
35 #include "shader/prog_parameter.h"
36 #include "shader/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40
41
42 /* Do things as simply as possible. Allocate and populate all regs
43 * ahead of time.
44 */
45 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
46 {
47 GLuint i, reg = 0, mrf;
48 GLuint nr_params;
49
50 /* r0 -- reserved as usual
51 */
52 c->r0 = brw_vec8_grf(reg, 0); reg++;
53
54 /* User clip planes from curbe:
55 */
56 if (c->key.nr_userclip) {
57 for (i = 0; i < c->key.nr_userclip; i++) {
58 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
59 }
60
61 /* Deal with curbe alignment:
62 */
63 reg += ((6+c->key.nr_userclip+3)/4)*2;
64 }
65
66 /* Vertex program parameters from curbe:
67 */
68 nr_params = c->vp->program.Base.Parameters->NumParameters;
69 for (i = 0; i < nr_params; i++) {
70 c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
71 }
72 reg += (nr_params+1)/2;
73
74 c->prog_data.curb_read_length = reg - 1;
75
76
77
78 /* Allocate input regs:
79 */
80 c->nr_inputs = 0;
81 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
82 if (c->prog_data.inputs_read & (1<<i)) {
83 c->nr_inputs++;
84 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
85 reg++;
86 }
87 }
88
89
90 /* Allocate outputs: TODO: could organize the non-position outputs
91 * to go straight into message regs.
92 */
93 c->nr_outputs = 0;
94 c->first_output = reg;
95 mrf = 4;
96 for (i = 0; i < VERT_RESULT_MAX; i++) {
97 if (c->prog_data.outputs_written & (1<<i)) {
98 c->nr_outputs++;
99 if (i == VERT_RESULT_HPOS) {
100 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
101 reg++;
102 }
103 else if (i == VERT_RESULT_PSIZ) {
104 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
105 reg++;
106 mrf++; /* just a placeholder? XXX fix later stages & remove this */
107 }
108 else {
109 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
110 mrf++;
111 }
112 }
113 }
114
115 /* Allocate program temporaries:
116 */
117 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
118 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
119 reg++;
120 }
121
122 /* Address reg(s). Don't try to use the internal address reg until
123 * deref time.
124 */
125 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
126 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
127 reg,
128 0,
129 BRW_REGISTER_TYPE_D,
130 BRW_VERTICAL_STRIDE_8,
131 BRW_WIDTH_8,
132 BRW_HORIZONTAL_STRIDE_1,
133 BRW_SWIZZLE_XXXX,
134 WRITEMASK_X);
135 reg++;
136 }
137
138 for (i = 0; i < 128; i++) {
139 if (c->output_regs[i].used_in_src) {
140 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
141 reg++;
142 }
143 }
144
145 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
146 reg += 2;
147
148
149 /* Some opcodes need an internal temporary:
150 */
151 c->first_tmp = reg;
152 c->last_tmp = reg; /* for allocation purposes */
153
154 /* Each input reg holds data from two vertices. The
155 * urb_read_length is the number of registers read from *each*
156 * vertex urb, so is half the amount:
157 */
158 c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
159
160 c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
161 c->prog_data.total_grf = reg;
162 }
163
164
165 static struct brw_reg get_tmp( struct brw_vs_compile *c )
166 {
167 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
168
169 if (++c->last_tmp > c->prog_data.total_grf)
170 c->prog_data.total_grf = c->last_tmp;
171
172 return tmp;
173 }
174
175 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
176 {
177 if (tmp.nr == c->last_tmp-1)
178 c->last_tmp--;
179 }
180
181 static void release_tmps( struct brw_vs_compile *c )
182 {
183 c->last_tmp = c->first_tmp;
184 }
185
186
187 static void unalias1( struct brw_vs_compile *c,
188 struct brw_reg dst,
189 struct brw_reg arg0,
190 void (*func)( struct brw_vs_compile *,
191 struct brw_reg,
192 struct brw_reg ))
193 {
194 if (dst.file == arg0.file && dst.nr == arg0.nr) {
195 struct brw_compile *p = &c->func;
196 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
197 func(c, tmp, arg0);
198 brw_MOV(p, dst, tmp);
199 }
200 else {
201 func(c, dst, arg0);
202 }
203 }
204
205 static void unalias2( struct brw_vs_compile *c,
206 struct brw_reg dst,
207 struct brw_reg arg0,
208 struct brw_reg arg1,
209 void (*func)( struct brw_vs_compile *,
210 struct brw_reg,
211 struct brw_reg,
212 struct brw_reg ))
213 {
214 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
215 (dst.file == arg1.file && dst.nr == arg1.nr)) {
216 struct brw_compile *p = &c->func;
217 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
218 func(c, tmp, arg0, arg1);
219 brw_MOV(p, dst, tmp);
220 }
221 else {
222 func(c, dst, arg0, arg1);
223 }
224 }
225
226 static void emit_sop( struct brw_compile *p,
227 struct brw_reg dst,
228 struct brw_reg arg0,
229 struct brw_reg arg1,
230 GLuint cond)
231 {
232 brw_push_insn_state(p);
233 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
234 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
235 brw_MOV(p, dst, brw_imm_f(1.0f));
236 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
237 brw_MOV(p, dst, brw_imm_f(0.0f));
238 brw_pop_insn_state(p);
239 }
240
241 static void emit_seq( struct brw_compile *p,
242 struct brw_reg dst,
243 struct brw_reg arg0,
244 struct brw_reg arg1 )
245 {
246 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
247 }
248
249 static void emit_sne( struct brw_compile *p,
250 struct brw_reg dst,
251 struct brw_reg arg0,
252 struct brw_reg arg1 )
253 {
254 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
255 }
256 static void emit_slt( struct brw_compile *p,
257 struct brw_reg dst,
258 struct brw_reg arg0,
259 struct brw_reg arg1 )
260 {
261 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
262 }
263
264 static void emit_sle( struct brw_compile *p,
265 struct brw_reg dst,
266 struct brw_reg arg0,
267 struct brw_reg arg1 )
268 {
269 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
270 }
271
272 static void emit_sgt( struct brw_compile *p,
273 struct brw_reg dst,
274 struct brw_reg arg0,
275 struct brw_reg arg1 )
276 {
277 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
278 }
279
280 static void emit_sge( struct brw_compile *p,
281 struct brw_reg dst,
282 struct brw_reg arg0,
283 struct brw_reg arg1 )
284 {
285 emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
286 }
287
288 static void emit_max( struct brw_compile *p,
289 struct brw_reg dst,
290 struct brw_reg arg0,
291 struct brw_reg arg1 )
292 {
293 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
294 brw_SEL(p, dst, arg1, arg0);
295 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
296 }
297
298 static void emit_min( struct brw_compile *p,
299 struct brw_reg dst,
300 struct brw_reg arg0,
301 struct brw_reg arg1 )
302 {
303 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
304 brw_SEL(p, dst, arg0, arg1);
305 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
306 }
307
308
309 static void emit_math1( struct brw_vs_compile *c,
310 GLuint function,
311 struct brw_reg dst,
312 struct brw_reg arg0,
313 GLuint precision)
314 {
315 /* There are various odd behaviours with SEND on the simulator. In
316 * addition there are documented issues with the fact that the GEN4
317 * processor doesn't do dependency control properly on SEND
318 * results. So, on balance, this kludge to get around failures
319 * with writemasked math results looks like it might be necessary
320 * whether that turns out to be a simulator bug or not:
321 */
322 struct brw_compile *p = &c->func;
323 struct brw_reg tmp = dst;
324 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
325 dst.file != BRW_GENERAL_REGISTER_FILE);
326
327 if (need_tmp)
328 tmp = get_tmp(c);
329
330 brw_math(p,
331 tmp,
332 function,
333 BRW_MATH_SATURATE_NONE,
334 2,
335 arg0,
336 BRW_MATH_DATA_SCALAR,
337 precision);
338
339 if (need_tmp) {
340 brw_MOV(p, dst, tmp);
341 release_tmp(c, tmp);
342 }
343 }
344
345 static void emit_math2( struct brw_vs_compile *c,
346 GLuint function,
347 struct brw_reg dst,
348 struct brw_reg arg0,
349 struct brw_reg arg1,
350 GLuint precision)
351 {
352 struct brw_compile *p = &c->func;
353 struct brw_reg tmp = dst;
354 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
355 dst.file != BRW_GENERAL_REGISTER_FILE);
356
357 if (need_tmp)
358 tmp = get_tmp(c);
359
360 brw_MOV(p, brw_message_reg(3), arg1);
361
362 brw_math(p,
363 tmp,
364 function,
365 BRW_MATH_SATURATE_NONE,
366 2,
367 arg0,
368 BRW_MATH_DATA_SCALAR,
369 precision);
370
371 if (need_tmp) {
372 brw_MOV(p, dst, tmp);
373 release_tmp(c, tmp);
374 }
375 }
376
377
378
379 static void emit_exp_noalias( struct brw_vs_compile *c,
380 struct brw_reg dst,
381 struct brw_reg arg0 )
382 {
383 struct brw_compile *p = &c->func;
384
385
386 if (dst.dw1.bits.writemask & WRITEMASK_X) {
387 struct brw_reg tmp = get_tmp(c);
388 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
389
390 /* tmp_d = floor(arg0.x) */
391 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
392
393 /* result[0] = 2.0 ^ tmp */
394
395 /* Adjust exponent for floating point:
396 * exp += 127
397 */
398 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
399
400 /* Install exponent and sign.
401 * Excess drops off the edge:
402 */
403 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
404 tmp_d, brw_imm_d(23));
405
406 release_tmp(c, tmp);
407 }
408
409 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
410 /* result[1] = arg0.x - floor(arg0.x) */
411 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
412 }
413
414 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
415 /* As with the LOG instruction, we might be better off just
416 * doing a taylor expansion here, seeing as we have to do all
417 * the prep work.
418 *
419 * If mathbox partial precision is too low, consider also:
420 * result[3] = result[0] * EXP(result[1])
421 */
422 emit_math1(c,
423 BRW_MATH_FUNCTION_EXP,
424 brw_writemask(dst, WRITEMASK_Z),
425 brw_swizzle1(arg0, 0),
426 BRW_MATH_PRECISION_PARTIAL);
427 }
428
429 if (dst.dw1.bits.writemask & WRITEMASK_W) {
430 /* result[3] = 1.0; */
431 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
432 }
433 }
434
435
436 static void emit_log_noalias( struct brw_vs_compile *c,
437 struct brw_reg dst,
438 struct brw_reg arg0 )
439 {
440 struct brw_compile *p = &c->func;
441 struct brw_reg tmp = dst;
442 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
443 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
444 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
445 dst.file != BRW_GENERAL_REGISTER_FILE);
446
447 if (need_tmp) {
448 tmp = get_tmp(c);
449 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
450 }
451
452 /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
453 * according to spec:
454 *
455 * These almost look likey they could be joined up, but not really
456 * practical:
457 *
458 * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
459 * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
460 */
461 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
462 brw_AND(p,
463 brw_writemask(tmp_ud, WRITEMASK_X),
464 brw_swizzle1(arg0_ud, 0),
465 brw_imm_ud((1U<<31)-1));
466
467 brw_SHR(p,
468 brw_writemask(tmp_ud, WRITEMASK_X),
469 tmp_ud,
470 brw_imm_ud(23));
471
472 brw_ADD(p,
473 brw_writemask(tmp, WRITEMASK_X),
474 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
475 brw_imm_d(-127));
476 }
477
478 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
479 brw_AND(p,
480 brw_writemask(tmp_ud, WRITEMASK_Y),
481 brw_swizzle1(arg0_ud, 0),
482 brw_imm_ud((1<<23)-1));
483
484 brw_OR(p,
485 brw_writemask(tmp_ud, WRITEMASK_Y),
486 tmp_ud,
487 brw_imm_ud(127<<23));
488 }
489
490 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
491 /* result[2] = result[0] + LOG2(result[1]); */
492
493 /* Why bother? The above is just a hint how to do this with a
494 * taylor series. Maybe we *should* use a taylor series as by
495 * the time all the above has been done it's almost certainly
496 * quicker than calling the mathbox, even with low precision.
497 *
498 * Options are:
499 * - result[0] + mathbox.LOG2(result[1])
500 * - mathbox.LOG2(arg0.x)
501 * - result[0] + inline_taylor_approx(result[1])
502 */
503 emit_math1(c,
504 BRW_MATH_FUNCTION_LOG,
505 brw_writemask(tmp, WRITEMASK_Z),
506 brw_swizzle1(tmp, 1),
507 BRW_MATH_PRECISION_FULL);
508
509 brw_ADD(p,
510 brw_writemask(tmp, WRITEMASK_Z),
511 brw_swizzle1(tmp, 2),
512 brw_swizzle1(tmp, 0));
513 }
514
515 if (dst.dw1.bits.writemask & WRITEMASK_W) {
516 /* result[3] = 1.0; */
517 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
518 }
519
520 if (need_tmp) {
521 brw_MOV(p, dst, tmp);
522 release_tmp(c, tmp);
523 }
524 }
525
526
527
528
529 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
530 */
531 static void emit_dst_noalias( struct brw_vs_compile *c,
532 struct brw_reg dst,
533 struct brw_reg arg0,
534 struct brw_reg arg1)
535 {
536 struct brw_compile *p = &c->func;
537
538 /* There must be a better way to do this:
539 */
540 if (dst.dw1.bits.writemask & WRITEMASK_X)
541 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
542 if (dst.dw1.bits.writemask & WRITEMASK_Y)
543 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
544 if (dst.dw1.bits.writemask & WRITEMASK_Z)
545 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
546 if (dst.dw1.bits.writemask & WRITEMASK_W)
547 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
548 }
549
550 static void emit_xpd( struct brw_compile *p,
551 struct brw_reg dst,
552 struct brw_reg t,
553 struct brw_reg u)
554 {
555 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
556 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
557 }
558
559
560
561 static void emit_lit_noalias( struct brw_vs_compile *c,
562 struct brw_reg dst,
563 struct brw_reg arg0 )
564 {
565 struct brw_compile *p = &c->func;
566 struct brw_instruction *if_insn;
567 struct brw_reg tmp = dst;
568 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
569
570 if (need_tmp)
571 tmp = get_tmp(c);
572
573 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
574 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
575
576 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
577 * to get all channels active inside the IF. In the clipping code
578 * we run with NoMask, so it's not an option and we can use
579 * BRW_EXECUTE_1 for all comparisions.
580 */
581 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
582 if_insn = brw_IF(p, BRW_EXECUTE_8);
583 {
584 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
585
586 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
587 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
588 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
589
590 emit_math2(c,
591 BRW_MATH_FUNCTION_POW,
592 brw_writemask(dst, WRITEMASK_Z),
593 brw_swizzle1(tmp, 2),
594 brw_swizzle1(arg0, 3),
595 BRW_MATH_PRECISION_PARTIAL);
596 }
597
598 brw_ENDIF(p, if_insn);
599 }
600
601
602
603
604
605 /* TODO: relative addressing!
606 */
607 static struct brw_reg get_reg( struct brw_vs_compile *c,
608 GLuint file,
609 GLuint index )
610 {
611
612 switch (file) {
613 case PROGRAM_TEMPORARY:
614 case PROGRAM_INPUT:
615 case PROGRAM_OUTPUT:
616 assert(c->regs[file][index].nr != 0);
617 return c->regs[file][index];
618 case PROGRAM_STATE_VAR:
619 case PROGRAM_CONSTANT:
620 case PROGRAM_UNIFORM:
621 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
622 return c->regs[PROGRAM_STATE_VAR][index];
623 case PROGRAM_ADDRESS:
624 assert(index == 0);
625 return c->regs[file][index];
626
627 case PROGRAM_UNDEFINED: /* undef values */
628 return brw_null_reg();
629
630 case PROGRAM_LOCAL_PARAM:
631 case PROGRAM_ENV_PARAM:
632 case PROGRAM_WRITE_ONLY:
633 default:
634 assert(0);
635 return brw_null_reg();
636 }
637 }
638
639
640
641 static struct brw_reg deref( struct brw_vs_compile *c,
642 struct brw_reg arg,
643 GLint offset)
644 {
645 struct brw_compile *p = &c->func;
646 struct brw_reg tmp = vec4(get_tmp(c));
647 struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
648 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
649 struct brw_reg indirect = brw_vec4_indirect(0,0);
650
651 {
652 brw_push_insn_state(p);
653 brw_set_access_mode(p, BRW_ALIGN_1);
654
655 /* This is pretty clunky - load the address register twice and
656 * fetch each 4-dword value in turn. There must be a way to do
657 * this in a single pass, but I couldn't get it to work.
658 */
659 brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
660 brw_MOV(p, tmp, indirect);
661
662 brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
663 brw_MOV(p, suboffset(tmp, 4), indirect);
664
665 brw_pop_insn_state(p);
666 }
667
668 return vec8(tmp);
669 }
670
671
672 static void emit_arl( struct brw_vs_compile *c,
673 struct brw_reg dst,
674 struct brw_reg arg0 )
675 {
676 struct brw_compile *p = &c->func;
677 struct brw_reg tmp = dst;
678 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
679
680 if (need_tmp)
681 tmp = get_tmp(c);
682
683 brw_RNDD(p, tmp, arg0);
684 brw_MUL(p, dst, tmp, brw_imm_d(16));
685
686 if (need_tmp)
687 release_tmp(c, tmp);
688 }
689
690
691 /* Will return mangled results for SWZ op. The emit_swz() function
692 * ignores this result and recalculates taking extended swizzles into
693 * account.
694 */
695 static struct brw_reg get_arg( struct brw_vs_compile *c,
696 struct prog_src_register *src )
697 {
698 struct brw_reg reg;
699
700 if (src->File == PROGRAM_UNDEFINED)
701 return brw_null_reg();
702
703 if (src->RelAddr)
704 reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
705 else
706 reg = get_reg(c, src->File, src->Index);
707
708 /* Convert 3-bit swizzle to 2-bit.
709 */
710 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
711 GET_SWZ(src->Swizzle, 1),
712 GET_SWZ(src->Swizzle, 2),
713 GET_SWZ(src->Swizzle, 3));
714
715 /* Note this is ok for non-swizzle instructions:
716 */
717 reg.negate = src->NegateBase ? 1 : 0;
718
719 return reg;
720 }
721
722
723 static struct brw_reg get_dst( struct brw_vs_compile *c,
724 struct prog_dst_register dst )
725 {
726 struct brw_reg reg = get_reg(c, dst.File, dst.Index);
727
728 reg.dw1.bits.writemask = dst.WriteMask;
729
730 return reg;
731 }
732
733
734
735
736 static void emit_swz( struct brw_vs_compile *c,
737 struct brw_reg dst,
738 struct prog_src_register src )
739 {
740 struct brw_compile *p = &c->func;
741 GLuint zeros_mask = 0;
742 GLuint ones_mask = 0;
743 GLuint src_mask = 0;
744 GLubyte src_swz[4];
745 GLboolean need_tmp = (src.NegateBase &&
746 dst.file != BRW_GENERAL_REGISTER_FILE);
747 struct brw_reg tmp = dst;
748 GLuint i;
749
750 if (need_tmp)
751 tmp = get_tmp(c);
752
753 for (i = 0; i < 4; i++) {
754 if (dst.dw1.bits.writemask & (1<<i)) {
755 GLubyte s = GET_SWZ(src.Swizzle, i);
756 switch (s) {
757 case SWIZZLE_X:
758 case SWIZZLE_Y:
759 case SWIZZLE_Z:
760 case SWIZZLE_W:
761 src_mask |= 1<<i;
762 src_swz[i] = s;
763 break;
764 case SWIZZLE_ZERO:
765 zeros_mask |= 1<<i;
766 break;
767 case SWIZZLE_ONE:
768 ones_mask |= 1<<i;
769 break;
770 }
771 }
772 }
773
774 /* Do src first, in case dst aliases src:
775 */
776 if (src_mask) {
777 struct brw_reg arg0;
778
779 if (src.RelAddr)
780 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
781 else
782 arg0 = get_reg(c, src.File, src.Index);
783
784 arg0 = brw_swizzle(arg0,
785 src_swz[0], src_swz[1],
786 src_swz[2], src_swz[3]);
787
788 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
789 }
790
791 if (zeros_mask)
792 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
793
794 if (ones_mask)
795 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
796
797 if (src.NegateBase)
798 brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
799
800 if (need_tmp) {
801 brw_MOV(p, dst, tmp);
802 release_tmp(c, tmp);
803 }
804 }
805
806
807
808 /* Post-vertex-program processing. Send the results to the URB.
809 */
810 static void emit_vertex_write( struct brw_vs_compile *c)
811 {
812 struct brw_compile *p = &c->func;
813 struct brw_reg m0 = brw_message_reg(0);
814 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
815 struct brw_reg ndc;
816
817 if (c->key.copy_edgeflag) {
818 brw_MOV(p,
819 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
820 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
821 }
822
823
824 /* Build ndc coords? TODO: Shortcircuit when w is known to be one.
825 */
826 if (!c->key.know_w_is_one) {
827 ndc = get_tmp(c);
828 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
829 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
830 }
831 else {
832 ndc = pos;
833 }
834
835 /* This includes the workaround for -ve rhw, so is no longer an
836 * optional step:
837 */
838 if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
839 c->key.nr_userclip ||
840 !c->key.know_w_is_one)
841 {
842 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
843 GLuint i;
844
845 brw_MOV(p, header1, brw_imm_ud(0));
846
847 brw_set_access_mode(p, BRW_ALIGN_16);
848
849 if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
850 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
851 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
852 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
853 }
854
855
856 for (i = 0; i < c->key.nr_userclip; i++) {
857 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
858 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
859 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
860 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
861 }
862
863
864 /* i965 clipping workaround:
865 * 1) Test for -ve rhw
866 * 2) If set,
867 * set ndc = (0,0,0,0)
868 * set ucp[6] = 1
869 *
870 * Later, clipping will detect ucp[6] and ensure the primitive is
871 * clipped against all fixed planes.
872 */
873 if (!c->key.know_w_is_one) {
874 brw_CMP(p,
875 vec8(brw_null_reg()),
876 BRW_CONDITIONAL_L,
877 brw_swizzle1(ndc, 3),
878 brw_imm_f(0));
879
880 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
881 brw_MOV(p, ndc, brw_imm_f(0));
882 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
883 }
884
885 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
886 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
887 brw_set_access_mode(p, BRW_ALIGN_16);
888
889 release_tmp(c, header1);
890 }
891 else {
892 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
893 }
894
895
896 /* Emit the (interleaved) headers for the two vertices - an 8-reg
897 * of zeros followed by two sets of NDC coordinates:
898 */
899 brw_set_access_mode(p, BRW_ALIGN_1);
900 brw_MOV(p, offset(m0, 2), ndc);
901 brw_MOV(p, offset(m0, 3), pos);
902
903
904 brw_urb_WRITE(p,
905 brw_null_reg(), /* dest */
906 0, /* starting mrf reg nr */
907 c->r0, /* src */
908 0, /* allocate */
909 1, /* used */
910 c->nr_outputs + 3, /* msg len */
911 0, /* response len */
912 1, /* eot */
913 1, /* writes complete */
914 0, /* urb destination offset */
915 BRW_URB_SWIZZLE_INTERLEAVE);
916
917 }
918
919 static void
920 post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
921 {
922 GLuint nr_insns = c->vp->program.Base.NumInstructions;
923 GLuint insn, target_insn;
924 struct prog_instruction *inst1, *inst2;
925 struct brw_instruction *brw_inst1, *brw_inst2;
926 int offset;
927 for (insn = 0; insn < nr_insns; insn++) {
928 inst1 = &c->vp->program.Base.Instructions[insn];
929 brw_inst1 = inst1->Data;
930 switch (inst1->Opcode) {
931 case OPCODE_CAL:
932 case OPCODE_BRA:
933 target_insn = inst1->BranchTarget;
934 inst2 = &c->vp->program.Base.Instructions[target_insn];
935 brw_inst2 = inst2->Data;
936 offset = brw_inst2 - brw_inst1;
937 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
938 break;
939 case OPCODE_END:
940 offset = end_inst - brw_inst1;
941 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
942 break;
943 default:
944 break;
945 }
946 }
947 }
948
949 /* Emit the fragment program instructions here.
950 */
951 void brw_vs_emit(struct brw_vs_compile *c )
952 {
953 #define MAX_IFSN 32
954 struct brw_compile *p = &c->func;
955 GLuint nr_insns = c->vp->program.Base.NumInstructions;
956 GLuint insn, if_insn = 0;
957 struct brw_instruction *end_inst;
958 struct brw_instruction *if_inst[MAX_IFSN];
959 struct brw_indirect stack_index = brw_indirect(0, 0);
960
961 GLuint index;
962 GLuint file;
963
964 if (INTEL_DEBUG & DEBUG_VS) {
965 _mesa_printf("\n\n\nvs-emit:\n");
966 _mesa_print_program(&c->vp->program.Base);
967 _mesa_printf("\n");
968 }
969
970 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
971 brw_set_access_mode(p, BRW_ALIGN_16);
972
973 /* Message registers can't be read, so copy the output into GRF register
974 if they are used in source registers */
975 for (insn = 0; insn < nr_insns; insn++) {
976 GLuint i;
977 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
978 for (i = 0; i < 3; i++) {
979 struct prog_src_register *src = &inst->SrcReg[i];
980 GLuint index = src->Index;
981 GLuint file = src->File;
982 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
983 c->output_regs[index].used_in_src = GL_TRUE;
984 }
985 }
986
987 /* Static register allocation
988 */
989 brw_vs_alloc_regs(c);
990 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
991
992 for (insn = 0; insn < nr_insns; insn++) {
993
994 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
995 struct brw_reg args[3], dst;
996 GLuint i;
997
998 /* Get argument regs. SWZ is special and does this itself.
999 */
1000 inst->Data = &p->store[p->nr_insn];
1001 if (inst->Opcode != OPCODE_SWZ)
1002 for (i = 0; i < 3; i++) {
1003 struct prog_src_register *src = &inst->SrcReg[i];
1004 index = src->Index;
1005 file = src->File;
1006 if (file == PROGRAM_OUTPUT&&c->output_regs[index].used_in_src)
1007 args[i] = c->output_regs[index].reg;
1008 else
1009 args[i] = get_arg(c, src);
1010 }
1011
1012 /* Get dest regs. Note that it is possible for a reg to be both
1013 * dst and arg, given the static allocation of registers. So
1014 * care needs to be taken emitting multi-operation instructions.
1015 */
1016 index = inst->DstReg.Index;
1017 file = inst->DstReg.File;
1018 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1019 dst = c->output_regs[index].reg;
1020 else
1021 dst = get_dst(c, inst->DstReg);
1022
1023 switch (inst->Opcode) {
1024 case OPCODE_ABS:
1025 brw_MOV(p, dst, brw_abs(args[0]));
1026 break;
1027 case OPCODE_ADD:
1028 brw_ADD(p, dst, args[0], args[1]);
1029 break;
1030 case OPCODE_DP3:
1031 brw_DP3(p, dst, args[0], args[1]);
1032 break;
1033 case OPCODE_DP4:
1034 brw_DP4(p, dst, args[0], args[1]);
1035 break;
1036 case OPCODE_DPH:
1037 brw_DPH(p, dst, args[0], args[1]);
1038 break;
1039 case OPCODE_DST:
1040 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1041 break;
1042 case OPCODE_EXP:
1043 unalias1(c, dst, args[0], emit_exp_noalias);
1044 break;
1045 case OPCODE_EX2:
1046 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1047 break;
1048 case OPCODE_ARL:
1049 emit_arl(c, dst, args[0]);
1050 break;
1051 case OPCODE_FLR:
1052 brw_RNDD(p, dst, args[0]);
1053 break;
1054 case OPCODE_FRC:
1055 brw_FRC(p, dst, args[0]);
1056 break;
1057 case OPCODE_LOG:
1058 unalias1(c, dst, args[0], emit_log_noalias);
1059 break;
1060 case OPCODE_LG2:
1061 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
1062 break;
1063 case OPCODE_LIT:
1064 unalias1(c, dst, args[0], emit_lit_noalias);
1065 break;
1066 case OPCODE_MAD:
1067 brw_MOV(p, brw_acc_reg(), args[2]);
1068 brw_MAC(p, dst, args[0], args[1]);
1069 break;
1070 case OPCODE_MAX:
1071 emit_max(p, dst, args[0], args[1]);
1072 break;
1073 case OPCODE_MIN:
1074 emit_min(p, dst, args[0], args[1]);
1075 break;
1076 case OPCODE_MOV:
1077 brw_MOV(p, dst, args[0]);
1078 break;
1079 case OPCODE_MUL:
1080 brw_MUL(p, dst, args[0], args[1]);
1081 break;
1082 case OPCODE_POW:
1083 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
1084 break;
1085 case OPCODE_RCP:
1086 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
1087 break;
1088 case OPCODE_RSQ:
1089 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
1090 break;
1091
1092 case OPCODE_SEQ:
1093 emit_seq(p, dst, args[0], args[1]);
1094 break;
1095 case OPCODE_SNE:
1096 emit_sne(p, dst, args[0], args[1]);
1097 break;
1098 case OPCODE_SGE:
1099 emit_sge(p, dst, args[0], args[1]);
1100 break;
1101 case OPCODE_SGT:
1102 emit_sgt(p, dst, args[0], args[1]);
1103 break;
1104 case OPCODE_SLT:
1105 emit_slt(p, dst, args[0], args[1]);
1106 break;
1107 case OPCODE_SLE:
1108 emit_sle(p, dst, args[0], args[1]);
1109 break;
1110 case OPCODE_SUB:
1111 brw_ADD(p, dst, args[0], negate(args[1]));
1112 break;
1113 case OPCODE_SWZ:
1114 /* The args[0] value can't be used here as it won't have
1115 * correctly encoded the full swizzle:
1116 */
1117 emit_swz(c, dst, inst->SrcReg[0] );
1118 break;
1119 case OPCODE_XPD:
1120 emit_xpd(p, dst, args[0], args[1]);
1121 break;
1122 case OPCODE_IF:
1123 assert(if_insn < MAX_IFSN);
1124 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
1125 break;
1126 case OPCODE_ELSE:
1127 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
1128 break;
1129 case OPCODE_ENDIF:
1130 assert(if_insn > 0);
1131 brw_ENDIF(p, if_inst[--if_insn]);
1132 break;
1133 case OPCODE_BRA:
1134 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1135 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1136 brw_set_predicate_control_flag_value(p, 0xff);
1137 break;
1138 case OPCODE_CAL:
1139 brw_set_access_mode(p, BRW_ALIGN_1);
1140 brw_ADD(p, deref_1uw(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
1141 brw_set_access_mode(p, BRW_ALIGN_16);
1142 brw_ADD(p, get_addr_reg(stack_index),
1143 get_addr_reg(stack_index), brw_imm_d(4));
1144 inst->Data = &p->store[p->nr_insn];
1145 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1146 break;
1147 case OPCODE_RET:
1148 brw_ADD(p, get_addr_reg(stack_index),
1149 get_addr_reg(stack_index), brw_imm_d(-4));
1150 brw_set_access_mode(p, BRW_ALIGN_1);
1151 brw_MOV(p, brw_ip_reg(), deref_1uw(stack_index, 0));
1152 brw_set_access_mode(p, BRW_ALIGN_16);
1153 case OPCODE_END:
1154 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
1155 break;
1156 case OPCODE_PRINT:
1157 case OPCODE_BGNSUB:
1158 case OPCODE_ENDSUB:
1159 break;
1160 default:
1161 _mesa_printf("Unsupport opcode %d in vertex shader\n", inst->Opcode);
1162 break;
1163 }
1164
1165 if (inst->DstReg.File == PROGRAM_OUTPUT
1166 &&inst->DstReg.Index != VERT_RESULT_HPOS
1167 &&c->output_regs[inst->DstReg.Index].used_in_src)
1168 brw_MOV(p, get_dst(c, inst->DstReg), dst);
1169
1170 release_tmps(c);
1171 }
1172
1173 end_inst = &p->store[p->nr_insn];
1174 emit_vertex_write(c);
1175 post_vs_emit(c, end_inst);
1176 for (insn = 0; insn < nr_insns; insn++)
1177 c->vp->program.Base.Instructions[insn].Data = NULL;
1178 }