i965: Add support for gen6 reladdr VS constant loading.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * The fixed GRF layout produced here, in allocation order: r0 header,
 * user clip planes, push constants, vertex inputs, outputs (message
 * regs where possible, GRFs on overflow), program temporaries,
 * address reg(s), pull-constant staging regs, shadow copies of
 * outputs read as sources, the control-flow stack, and finally the
 * scratch-temp area starting at first_tmp.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;
   int first_reladdr_output;
   int max_constant;
   int constant = 0;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The latter is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
	 /* Two vec4 planes per reg; gen6 packs them right after r0. */
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
	 /* Pre-gen6, the planes start 6 vec4 slots into the curbe. */
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }

   }

   /* Assign some (probably all) of the vertex program constants to
    * the push constant buffer/CURBE.
    *
    * There's an obvious limit to the number of push constants equal to
    * the number of register available, and that number is smaller
    * than the minimum maximum number of vertex program parameters, so
    * support for pull constants is required if we overflow.
    * Additionally, on gen6 the number of push constants is even
    * lower.
    *
    * When there's relative addressing, we don't know what range of
    * Mesa IR registers can be accessed.  And generally, when relative
    * addressing is used we also have too many constants to load them
    * all as push constants.  So, we'll just support relative
    * addressing out of the pull constant buffers, and try to load as
    * many statically-accessed constants into the push constant buffer
    * as we can.
    */
   if (intel->gen >= 6) {
      /* We can only load 32 regs of push constants. */
      max_constant = 32 * 2 - c->key.nr_userclip;
   } else {
      max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
   }

   /* constant_map maps from ParameterValues[] index to index in the
    * push constant buffer, or -1 if it's only in the pull constant
    * buffer.
    */
   memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
   for (i = 0;
	i < c->vp->program.Base.NumInstructions && constant < max_constant;
	i++) {
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
      int arg;

      for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
	     inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
	     inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
	     inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
	     inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
	    continue;
	 }

	 /* Relative-addressed constants always come from the pull
	  * buffer; don't burn a push slot on them.
	  */
	 if (inst->SrcReg[arg].RelAddr) {
	    c->vp->use_const_buffer = GL_TRUE;
	    continue;
	 }

	 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	    c->constant_map[inst->SrcReg[arg].Index] = constant++;
	 }
      }
   }

   /* If we ran out of push constant space, then we'll also upload all
    * constants through the pull constant buffer so that they can be
    * accessed no matter what.  For relative addressing (the common
    * case) we need them all in place anyway.
    */
   if (constant == max_constant)
      c->vp->use_const_buffer = GL_TRUE;

   /* All constant-like files are read through the PROGRAM_STATE_VAR
    * slots; two vec4 constants share each GRF.
    */
   for (i = 0; i < constant; i++) {
      c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
							  (i % 2) * 4),
					     0, 4, 1);
   }
   reg += (constant + 1) / 2;
   c->prog_data.curb_read_length = reg - 1;
   c->prog_data.nr_params = constant * 4;
   /* XXX 0 causes a bug elsewhere... */
   if (intel->gen < 6 && c->prog_data.nr_params == 0)
      c->prog_data.nr_params = 4;

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for compute-to-MRF outputs; earlier MRFs are
    * reserved for the URB write header (and clip distances on gen6).
    */
   if (intel->gen >= 6) {
      mrf = 3;
      if (c->key.nr_userclip)
	 mrf += 2;
   } else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
	 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
	    if (first_reladdr_output > i && mrf < 15) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       if (mrf >= 15 && !c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	       mrf++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   /* Staging registers for pull-constant loads, one per source arg. */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
	 c->current_const[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
      clear_current_const(c);
   }

   /* Outputs that are also read as sources get a GRF shadow copy,
    * since message regs can't be read back.
    */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
	 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   /* Control-flow stack, when the program needs one. */
   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6) {
      int header_regs = 2;
      if (c->key.nr_userclip)
	 header_regs += 2;

      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
   } else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
423
424
425 /**
426 * If an instruction uses a temp reg both as a src and the dest, we
427 * sometimes need to allocate an intermediate temporary.
428 */
429 static void unalias1( struct brw_vs_compile *c,
430 struct brw_reg dst,
431 struct brw_reg arg0,
432 void (*func)( struct brw_vs_compile *,
433 struct brw_reg,
434 struct brw_reg ))
435 {
436 if (dst.file == arg0.file && dst.nr == arg0.nr) {
437 struct brw_compile *p = &c->func;
438 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
439 func(c, tmp, arg0);
440 brw_MOV(p, dst, tmp);
441 release_tmp(c, tmp);
442 }
443 else {
444 func(c, dst, arg0);
445 }
446 }
447
448 /**
449 * \sa unalias2
450 * Checkes if 2-operand instruction needs an intermediate temporary.
451 */
452 static void unalias2( struct brw_vs_compile *c,
453 struct brw_reg dst,
454 struct brw_reg arg0,
455 struct brw_reg arg1,
456 void (*func)( struct brw_vs_compile *,
457 struct brw_reg,
458 struct brw_reg,
459 struct brw_reg ))
460 {
461 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
462 (dst.file == arg1.file && dst.nr == arg1.nr)) {
463 struct brw_compile *p = &c->func;
464 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
465 func(c, tmp, arg0, arg1);
466 brw_MOV(p, dst, tmp);
467 release_tmp(c, tmp);
468 }
469 else {
470 func(c, dst, arg0, arg1);
471 }
472 }
473
474 /**
475 * \sa unalias2
476 * Checkes if 3-operand instruction needs an intermediate temporary.
477 */
478 static void unalias3( struct brw_vs_compile *c,
479 struct brw_reg dst,
480 struct brw_reg arg0,
481 struct brw_reg arg1,
482 struct brw_reg arg2,
483 void (*func)( struct brw_vs_compile *,
484 struct brw_reg,
485 struct brw_reg,
486 struct brw_reg,
487 struct brw_reg ))
488 {
489 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
490 (dst.file == arg1.file && dst.nr == arg1.nr) ||
491 (dst.file == arg2.file && dst.nr == arg2.nr)) {
492 struct brw_compile *p = &c->func;
493 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
494 func(c, tmp, arg0, arg1, arg2);
495 brw_MOV(p, dst, tmp);
496 release_tmp(c, tmp);
497 }
498 else {
499 func(c, dst, arg0, arg1, arg2);
500 }
501 }
502
/* Emit a Mesa set-on-condition op (SEQ/SNE/SLT/...): each enabled
 * channel of dst becomes 1.0 where "arg0 cond arg1" holds, 0.0
 * elsewhere.
 */
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   /* Start with 0.0 everywhere... */
   brw_MOV(p, dst, brw_imm_f(0.0f));
   /* ...compare into the flag register (NOTE(review): the following
    * MOV appears to execute predicated on this CMP result — confirm
    * brw_CMP's predication side effect in brw_eu_emit.c)...
    */
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   /* ...overwrite passing channels with 1.0... */
   brw_MOV(p, dst, brw_imm_f(1.0f));
   /* ...then restore an all-ones flag value for later instructions. */
   brw_set_predicate_control_flag_value(p, 0xff);
}
516
517 static void emit_seq( struct brw_vs_compile *c,
518 struct brw_reg dst,
519 struct brw_reg arg0,
520 struct brw_reg arg1 )
521 {
522 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
523 }
524
525 static void emit_sne( struct brw_vs_compile *c,
526 struct brw_reg dst,
527 struct brw_reg arg0,
528 struct brw_reg arg1 )
529 {
530 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
531 }
532 static void emit_slt( struct brw_vs_compile *c,
533 struct brw_reg dst,
534 struct brw_reg arg0,
535 struct brw_reg arg1 )
536 {
537 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
538 }
539
540 static void emit_sle( struct brw_vs_compile *c,
541 struct brw_reg dst,
542 struct brw_reg arg0,
543 struct brw_reg arg1 )
544 {
545 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
546 }
547
548 static void emit_sgt( struct brw_vs_compile *c,
549 struct brw_reg dst,
550 struct brw_reg arg0,
551 struct brw_reg arg1 )
552 {
553 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
554 }
555
556 static void emit_sge( struct brw_vs_compile *c,
557 struct brw_reg dst,
558 struct brw_reg arg0,
559 struct brw_reg arg1 )
560 {
561 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
562 }
563
/* CMP: per-channel dst = (arg0 < 0) ? arg1 : arg2. */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   /* Flag the channels where arg0 < 0; the SEL picks arg1 on flagged
    * channels and arg2 elsewhere (predication is reset afterwards).
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
574
/* SSG: per-channel sign of arg0 as -1.0, 0.0 or +1.0. */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   /* Default every channel to 0.0... */
   brw_MOV(p, dst, brw_imm_f(0));

   /* ...replace strictly-negative channels with -1.0... */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   /* ...and strictly-positive channels with +1.0. */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
591
/* MAX: per-channel dst = max(arg0, arg1). */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* gen6+: SEL with a conditional modifier does the compare and
       * select in a single instruction.
       */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      /* Earlier gens: compare into the flags, then predicated SEL. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
610
/* MIN: per-channel dst = min(arg0, arg1).  Mirror of emit_max with a
 * less-than condition.
 */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* gen6+: single conditional-mod SEL. */
      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      /* Earlier gens: compare into the flags, then predicated SEL. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
629
/* Single-operand math (EXP, LOG, RSQ, POW-style functions) for
 * gen4/5: a message to the shared math unit.
 */
static void emit_math1_gen4(struct brw_vs_compile *c,
			    GLuint function,
			    struct brw_reg dst,
			    struct brw_reg arg0,
			    GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   /* Route through a temporary whenever the destination is
    * writemasked or isn't a plain GRF.
    */
   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      /* The writemask is applied by this copy into the real dest. */
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
668
/* Single-operand math for gen6+: stage the operand into a temporary,
 * run the math instruction in align1 mode, then copy the result out
 * so the destination's writemask/swizzle still apply.
 */
static void
emit_math1_gen6(struct brw_vs_compile *c,
		GLuint function,
		struct brw_reg dst,
		struct brw_reg arg0,
		GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src, tmp_dst;

   /* Something is strange on gen6 math in 16-wide mode, though the
    * docs say it's supposed to work.  Punt to using align1 mode,
    * which doesn't do writemasking and swizzles.
    */
   tmp_src = get_tmp(c);
   tmp_dst = get_tmp(c);

   /* Resolve any swizzle on the source before dropping to align1. */
   brw_MOV(p, tmp_src, arg0);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
	    tmp_dst,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    tmp_src,
	    BRW_MATH_DATA_SCALAR,
	    precision);
   brw_set_access_mode(p, BRW_ALIGN_16);

   /* Apply dst's writemask via an ordinary align16 MOV. */
   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src);
   release_tmp(c, tmp_dst);
}
704
705 static void
706 emit_math1(struct brw_vs_compile *c,
707 GLuint function,
708 struct brw_reg dst,
709 struct brw_reg arg0,
710 GLuint precision)
711 {
712 struct brw_compile *p = &c->func;
713 struct intel_context *intel = &p->brw->intel;
714
715 if (intel->gen >= 6)
716 emit_math1_gen6(c, function, dst, arg0, precision);
717 else
718 emit_math1_gen4(c, function, dst, arg0, precision);
719 }
720
/* Two-operand math (e.g. POW) for gen4/5: the second operand is
 * staged in message register m3 before the math send.
 */
static void emit_math2_gen4( struct brw_vs_compile *c,
			     GLuint function,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   /* As in emit_math1_gen4: detour through a temp for writemasked or
    * non-GRF destinations.
    */
   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second operand travels in the message payload. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
755
/* Two-operand math for gen6+: stage both operands into temps, run the
 * math op in align1 mode (see emit_math1_gen6 for why), and copy the
 * result out through the destination's writemask.
 */
static void emit_math2_gen6( struct brw_vs_compile *c,
			     GLuint function,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src0, tmp_src1, tmp_dst;

   tmp_src0 = get_tmp(c);
   tmp_src1 = get_tmp(c);
   tmp_dst = get_tmp(c);

   /* Resolve source swizzles while still in align16 mode. */
   brw_MOV(p, tmp_src0, arg0);
   brw_MOV(p, tmp_src1, arg1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
	     tmp_dst,
	     function,
	     tmp_src0,
	     tmp_src1);
   brw_set_access_mode(p, BRW_ALIGN_16);

   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src0);
   release_tmp(c, tmp_src1);
   release_tmp(c, tmp_dst);
}
787
788 static void emit_math2( struct brw_vs_compile *c,
789 GLuint function,
790 struct brw_reg dst,
791 struct brw_reg arg0,
792 struct brw_reg arg1,
793 GLuint precision)
794 {
795 struct brw_compile *p = &c->func;
796 struct intel_context *intel = &p->brw->intel;
797
798 if (intel->gen >= 6)
799 emit_math2_gen6(c, function, dst, arg0, arg1, precision);
800 else
801 emit_math2_gen4(c, function, dst, arg0, arg1, precision);
802 }
803
/* ARB_vertex_program EXP:
 *   dst.x = 2^floor(arg0.x)  (built directly in the float exponent field)
 *   dst.y = arg0.x - floor(arg0.x)
 *   dst.z = 2^arg0.x         (via the math unit)
 *   dst.w = 1.0
 * "noalias": caller guarantees dst does not overlap arg0.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
859
860
/* ARB_vertex_program LOG:
 *   dst.x = exponent of |arg0.x|  (biased exponent field - 127)
 *   dst.y = mantissa of |arg0.x| in [1, 2)
 *   dst.z = log2(|arg0.x|)        (via the math unit)
 *   dst.w = 1.0
 * A temporary stands in for dst when it is writemasked or not a GRF,
 * since intermediate results are read back as sources.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look likely they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit... */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      /* ...shift the biased exponent down... */
      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      /* ...and un-bias it (integer add on the D-typed view). */
      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits and force the exponent to 127, giving
       * a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
950
951
/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 */
/* DST (distance vector): dst = (1, arg0.y*arg1.y, arg0.z, arg1.w),
 * one instruction per enabled channel.
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
972
973
/* XPD (cross product): dst.xyz = t.yzx*u.zxy - t.zxy*u.yzx, computed
 * as a MUL into the accumulator followed by MAC.  The w channel works
 * out to t.w*u.w - t.w*u.w = 0.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
982
983
/* LIT (lighting coefficients):
 *   dst.x = 1, dst.w = 1 always;
 *   if arg0.x <= 0:  dst.y = dst.z = 0
 *   else:            dst.y = arg0.x,
 *                    dst.z = max(arg0.y, 0) ^ arg0.w
 * "noalias": caller guarantees dst does not overlap arg0.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Defaults for the arg0.x <= 0 case (and x/w unconditionally). */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisons.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Clamp the specular dot product (arg0.y) to zero before the
       * POW; tmp.z already holds 0 from the MOV above when dst == tmp.
       */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   release_tmp(c, tmp);
}
1025
/* LRP: dst = arg0 * arg1 + (1 - arg0) * arg2, using the accumulator:
 * the MUL to null seeds it with (1-arg0)*arg2 and the MAC adds
 * arg0*arg1.  "noalias": dst must not overlap any source, since dst
 * holds the intermediate (1-arg0).
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
1038
/** 3 or 4-component vector normalization: dst = arg0 / |arg0|,
 * computed as arg0 * rsq(dot(arg0, arg0)).  num_comps selects DP3
 * vs DP4 for the length computation.
 */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
1062
1063
/* Fetch a statically-indexed constant from the pull constant buffer
 * into this argument slot's staging register, skipping the read when
 * the same constant is already resident from a previous instruction.
 * Returns the staging reg with the vec4 replicated to both halves
 * (XYZWXYZW) so it reads correctly for both vertices.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
1097
/* Fetch a relative-addressed constant (const[a0.x + Index]) from the
 * pull constant buffer into the argument slot's staging register.
 * On gen6 the read takes the element index directly; earlier gens
 * scale the address register to a byte offset (16 bytes per vec4).
 * The cached slot index is invalidated since a reladdr load can never
 * be reused.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   uint32_t offset;

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

 #if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   if (intel->gen >= 6) {
      offset = src->Index;
   } else {
      /* NOTE(review): this temp is never released — it stays live
       * until the next release_tmps(); confirm that's intended.
       */
      struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
      brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
      addr_reg = byte_addr_reg;
      offset = 16 * src->Index;
   }

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,
			     addr_reg,
			     offset,
			     SURF_INDEX_VERT_CONST_BUFFER);

   return const_reg;
}
1139
1140
1141
1142 /* TODO: relative addressing!
1143 */
1144 static struct brw_reg get_reg( struct brw_vs_compile *c,
1145 gl_register_file file,
1146 GLuint index )
1147 {
1148 switch (file) {
1149 case PROGRAM_TEMPORARY:
1150 case PROGRAM_INPUT:
1151 case PROGRAM_OUTPUT:
1152 assert(c->regs[file][index].nr != 0);
1153 return c->regs[file][index];
1154 case PROGRAM_STATE_VAR:
1155 case PROGRAM_CONSTANT:
1156 case PROGRAM_UNIFORM:
1157 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1158 return c->regs[PROGRAM_STATE_VAR][index];
1159 case PROGRAM_ADDRESS:
1160 assert(index == 0);
1161 return c->regs[file][index];
1162
1163 case PROGRAM_UNDEFINED: /* undef values */
1164 return brw_null_reg();
1165
1166 case PROGRAM_LOCAL_PARAM:
1167 case PROGRAM_ENV_PARAM:
1168 case PROGRAM_WRITE_ONLY:
1169 default:
1170 assert(0);
1171 return brw_null_reg();
1172 }
1173 }
1174
1175
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Computes a byte address from the address register for each of the two
 * vertices being processed, loads both into a0.0/a0.1, and gathers the
 * two vec4s through a VxH indirect MOV into a fresh temporary.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset,
			     GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   /* Base byte address of the register block being indexed. */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      /* Address math is scalar, so drop to align1 mode. */
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = first vertex's index * reg_size + base byte offset */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = second vertex's index (dwords 4..7 of the address reg) */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      /* Gather both vec4s through the address registers. */
      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released -- caller's instruction still needs it;
    * reclaimed by release_tmps() at the end of the instruction.
    */
   return tmp;
}
1215
/**
 * Store \p val to the relative-addressed destination register of \p inst:
 * reg[base + a0.x] = val.
 *
 * Because destination indirect addressing is 1x1 only (a single address
 * index, no VxH), the two vertices' vec4 values are written with two
 * separate indirect MOVs, reloading a0.0 in between.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   /* Byte address of the base of the destination register file block. */
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   /* Address math and the split writes are scalar-style: align1 mode. */
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* First vertex: a0.0 = index * reg_size + base, then write vec4 0. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Second vertex: index lives in dwords 4..7 of the address register;
    * its data is the second half of each 32-byte register pair.
    */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1250
/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   /* First, try to fold the argument into an immediate float, avoiding
    * a constant-buffer access entirely.  Only legal for opcodes/arg
    * positions approved by brw_vs_arg_can_be_immediate().
    */
   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO)) {
	 /* All channels swizzled to constant zero. */
	 return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE)) {
	 /* All channels swizzled to constant one (possibly negated). */
	 if (src->Negate)
	    return brw_imm_f(-1.0F);
	 else
	    return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
	 const struct gl_program_parameter_list *params;
	 float f;
	 int component = -1;

	 /* A program constant replicated across all channels can be
	  * emitted as the immediate value of that one component.
	  */
	 switch (src->Swizzle) {
	 case SWIZZLE_XXXX:
	    component = 0;
	    break;
	 case SWIZZLE_YYYY:
	    component = 1;
	    break;
	 case SWIZZLE_ZZZZ:
	    component = 2;
	    break;
	 case SWIZZLE_WWWW:
	    component = 3;
	    break;
	 }

	 if (component >= 0) {
	    params = c->vp->program.Base.Parameters;
	    f = params->ParameterValues[src->Index][component];

	    /* Bake Abs/Negate into the immediate. */
	    if (src->Abs)
	       f = fabs(f);
	    if (src->Negate)
	       f = -f;
	    return brw_imm_f(f);
	 }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
	 /* reg[a0 + index]; regs within a file are allocated contiguously,
	  * so index from the file's register 0.
	  */
	 return deref(c, c->regs[file][0], index, 32);
      }
      else {
	 assert(c->regs[file][index].nr != 0);
	 return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (!relAddr && c->constant_map[index] != -1) {
	 /* Take from the push constant buffer if possible. */
	 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
	 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
      } else {
	 /* Must be in the pull constant buffer then .*/
	 assert(c->vp->use_const_buffer);
	 if (relAddr)
	    return get_reladdr_constant(c, inst, argIndex);
	 else
	    return get_constant(c, inst, argIndex);
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1356
1357 /**
1358 * Return the brw reg for the given instruction's src argument.
1359 * Will return mangled results for SWZ op. The emit_swz() function
1360 * ignores this result and recalculates taking extended swizzles into
1361 * account.
1362 */
1363 static struct brw_reg get_arg( struct brw_vs_compile *c,
1364 const struct prog_instruction *inst,
1365 GLuint argIndex )
1366 {
1367 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1368 struct brw_reg reg;
1369
1370 if (src->File == PROGRAM_UNDEFINED)
1371 return brw_null_reg();
1372
1373 reg = get_src_reg(c, inst, argIndex);
1374
1375 /* Convert 3-bit swizzle to 2-bit.
1376 */
1377 if (reg.file != BRW_IMMEDIATE_VALUE) {
1378 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1379 GET_SWZ(src->Swizzle, 1),
1380 GET_SWZ(src->Swizzle, 2),
1381 GET_SWZ(src->Swizzle, 3));
1382 }
1383
1384 /* Note this is ok for non-swizzle instructions:
1385 */
1386 reg.negate = src->Negate ? 1 : 0;
1387
1388 return reg;
1389 }
1390
1391
1392 /**
1393 * Get brw register for the given program dest register.
1394 */
1395 static struct brw_reg get_dst( struct brw_vs_compile *c,
1396 struct prog_dst_register dst )
1397 {
1398 struct brw_reg reg;
1399
1400 switch (dst.File) {
1401 case PROGRAM_TEMPORARY:
1402 case PROGRAM_OUTPUT:
1403 /* register-indirect addressing is only 1x1, not VxH, for
1404 * destination regs. So, for RelAddr we'll return a temporary
1405 * for the dest and do a move of the result to the RelAddr
1406 * register after the instruction emit.
1407 */
1408 if (dst.RelAddr) {
1409 reg = get_tmp(c);
1410 } else {
1411 assert(c->regs[dst.File][dst.Index].nr != 0);
1412 reg = c->regs[dst.File][dst.Index];
1413 }
1414 break;
1415 case PROGRAM_ADDRESS:
1416 assert(dst.Index == 0);
1417 reg = c->regs[dst.File][dst.Index];
1418 break;
1419 case PROGRAM_UNDEFINED:
1420 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1421 reg = brw_null_reg();
1422 break;
1423 default:
1424 assert(0);
1425 reg = brw_null_reg();
1426 }
1427
1428 assert(reg.type != BRW_IMMEDIATE_VALUE);
1429 reg.dw1.bits.writemask = dst.WriteMask;
1430
1431 return reg;
1432 }
1433
1434
/**
 * Emit OPCODE_SWZ, which supports the extended swizzles (ZERO/ONE and
 * per-channel negate) that the normal get_arg() path cannot encode.
 *
 * Channels are partitioned by their swizzle selector into three masks
 * (real source channel, constant zero, constant one) and written with up
 * to three MOVs, followed by a per-channel negate pass.
 */
static void emit_swz( struct brw_vs_compile *c,
		      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* Negating in place is only safe on a GRF destination; otherwise
    * stage the result in a temporary.
    */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each enabled destination channel by its swizzle selector. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
	 GLubyte s = GET_SWZ(src.Swizzle, i);
	 switch (s) {
	 case SWIZZLE_X:
	 case SWIZZLE_Y:
	 case SWIZZLE_Z:
	 case SWIZZLE_W:
	    src_mask |= 1<<i;
	    src_swz[i] = s;
	    break;
	 case SWIZZLE_ZERO:
	    zeros_mask |= 1<<i;
	    break;
	 case SWIZZLE_ONE:
	    ones_mask |= 1<<i;
	    break;
	 }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask in the Mesa IR, so it is used
    * directly as the writemask for the negating move.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1503
1504
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the per-gen vertex header (clip flags, point size, NDC position,
 * user clip distances), copies the remaining outputs into message
 * registers, and issues one urb_WRITE -- plus a second one if the outputs
 * overflow the available MRFs.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;
   int next_mrf, i;

   /* Copy the input edge flag through to the output if requested. */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
	      get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
	      get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (intel->gen >= 6) {
      struct brw_reg m1 = brw_message_reg(1);

      /* On gen6, m1 has each value in a separate dword, so we never
       * need to mess with a temporary for computing the m1 value.
       */
      brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
	 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
		 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
      }

      /* Set the user clip distances in dword 8-15. (m3-4)*/
      if (c->key.nr_userclip) {
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    struct brw_reg m;
	    if (i < 4)
	       m = brw_message_reg(3);
	    else
	       m = brw_message_reg(4);

	    brw_DP4(p, brw_writemask(m, (1 << (i & 7))), pos, c->userplane[i]);
	 }
      }
   } else if ((c->prog_data.outputs_written &
	       BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
	      c->key.nr_userclip || brw->has_negative_rhw_bug) {
      /* Pre-gen6: pack point size and clip flags into a single UD header
       * dword, assembled in a temporary.
       */
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
	 /* Point size goes in bits 8..18 of the W dword, as a fixed-point
	  * value scaled by 1<<11 and masked to 11 bits.
	  */
	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
		 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
	 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
		 header1, brw_imm_ud(0x7ff<<8));
      }

      /* Set clip flag i when the vertex is on the negative side of user
       * plane i (DP4 < 0, applied via predication).
       */
      for (i = 0; i < c->key.nr_userclip; i++) {
	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
	 brw_CMP(p,
		 vec8(brw_null_reg()),
		 BRW_CONDITIONAL_L,
		 brw_swizzle1(ndc, 3),
		 brw_imm_f(0));

	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
	 brw_MOV(p, ndc, brw_imm_f(0));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No point size, clipping, or workaround needed: zero the header. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_acc_write_control(p, 0);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.
       * m3 or 5 is the first vertex element data we fill, which is
       * the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      len_vertex_header = 1;
      if (c->key.nr_userclip > 0)
	 len_vertex_header += 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Move variable-addressed, non-overflow outputs to their MRFs. */
   next_mrf = 2 + len_vertex_header;
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
	 break;
      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
	 continue;
      /* Point size was already folded into the header above. */
      if (i == VERT_RESULT_PSIZ)
	 continue;

      if (i >= VERT_RESULT_TEX0 &&
	  c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
	 /* Output lives in a GRF (e.g. it was also read as a source):
	  * copy it into the next message register.
	  */
	 brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
	 next_mrf++;
      } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
	 /* Output was allocated directly in an MRF; just advance past it. */
	 next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
      }
   }

   /* This write is the final one only if nothing overflowed the MRFs. */
   eot = (c->first_overflow_output == 0);

   brw_urb_WRITE(p,
		 brw_null_reg(), /* dest */
		 0,		 /* starting mrf reg nr */
		 c->r0,		 /* src */
		 0,		 /* allocate */
		 1,		 /* used */
		 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
		 0,		 /* response len */
		 eot,		 /* eot */
		 eot,		 /* writes complete */
		 0,		 /* urb destination offset */
		 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
	 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	    /* move from GRF to MRF */
	    brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
	    mrf++;
	 }
      }

      brw_urb_WRITE(p,
		    brw_null_reg(), /* dest */
		    0,		    /* starting mrf reg nr */
		    c->r0,	    /* src */
		    0,		    /* allocate */
		    1,		    /* used */
		    mrf,	    /* msg len */
		    0,		    /* response len */
		    1,		    /* eot */
		    1,		    /* writes complete */
		    14 / 2,	    /* urb destination offset */
		    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1725
1726 static GLboolean
1727 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1728 {
1729 struct brw_compile *p = &c->func;
1730 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1731
1732 if (p->nr_insn == 0)
1733 return GL_FALSE;
1734
1735 if (val.address_mode != BRW_ADDRESS_DIRECT)
1736 return GL_FALSE;
1737
1738 switch (prev_insn->header.opcode) {
1739 case BRW_OPCODE_MOV:
1740 case BRW_OPCODE_MAC:
1741 case BRW_OPCODE_MUL:
1742 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1743 prev_insn->header.execution_size == val.width &&
1744 prev_insn->bits1.da1.dest_reg_file == val.file &&
1745 prev_insn->bits1.da1.dest_reg_type == val.type &&
1746 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1747 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1748 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1749 prev_insn->bits1.da16.dest_writemask == 0xf)
1750 return GL_TRUE;
1751 else
1752 return GL_FALSE;
1753 default:
1754 return GL_FALSE;
1755 }
1756 }
1757
1758 static uint32_t
1759 get_predicate(const struct prog_instruction *inst)
1760 {
1761 if (inst->DstReg.CondMask == COND_TR)
1762 return BRW_PREDICATE_NONE;
1763
1764 /* All of GLSL only produces predicates for COND_NE and one channel per
1765 * vector. Fail badly if someone starts doing something else, as it might
1766 * mean infinite looping or something.
1767 *
1768 * We'd like to support all the condition codes, but our hardware doesn't
1769 * quite match the Mesa IR, which is modeled after the NV extensions. For
1770 * those, the instruction may update the condition codes or not, then any
1771 * later instruction may use one of those condition codes. For gen4, the
1772 * instruction may update the flags register based on one of the condition
1773 * codes output by the instruction, and then further instructions may
1774 * predicate on that. We can probably support this, but it won't
1775 * necessarily be easy.
1776 */
1777 assert(inst->DstReg.CondMask == COND_NE);
1778
1779 switch (inst->DstReg.CondSwizzle) {
1780 case SWIZZLE_XXXX:
1781 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1782 case SWIZZLE_YYYY:
1783 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1784 case SWIZZLE_ZZZZ:
1785 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1786 case SWIZZLE_WWWW:
1787 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1788 default:
1789 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1790 inst->DstReg.CondMask);
1791 return BRW_PREDICATE_NORMAL;
1792 }
1793 }
1794
/* Emit the vertex program instructions here.
 *
 * Top-level VS codegen entry point: scans the Mesa IR for register and
 * stack requirements, allocates registers, translates each instruction
 * to gen4-6 native code, then runs the post-passes (call resolution,
 * jump fixup, peephole optimization).
 */
void brw_vs_emit(struct brw_vs_compile *c )
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   /* Number of open IFs inside each loop level, needed to compute
    * BREAK/CONT pop counts on pre-gen6.
    */
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
			       GL_TRUE);
      printf("\n");
   }

   /* The VS is emitted in align16 (vec4) mode, uncompressed. */
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

   /* Enable implicit accumulator writes so accumulator_contains() can
    * elide the MOV-to-acc for OPCODE_MAD.
    */
   brw_set_acc_write_control(p, 1);

   /* Pre-pass over the IR to gather allocation requirements. */
   for (insn = 0; insn < nr_insns; insn++) {
       GLuint i;
       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

       /* Message registers can't be read, so copy the output into GRF
	* register if they are used in source registers
	*/
       for (i = 0; i < 3; i++) {
	   struct prog_src_register *src = &inst->SrcReg[i];
	   GLuint index = src->Index;
	   GLuint file = src->File;
	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
	       c->output_regs[index].used_in_src = GL_TRUE;
       }

       /* Subroutine calls need the return-address stack allocated. */
       switch (inst->Opcode) {
       case OPCODE_CAL:
       case OPCODE_RET:
	   c->needs_stack = GL_TRUE;
	   break;
       default:
	   break;
       }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
	 for (i = 0; i < 3; i++) {
	    const struct prog_src_register *src = &inst->SrcReg[i];
	    index = src->Index;
	    file = src->File;
	    /* Outputs also read as sources were shadowed in a GRF (see the
	     * pre-pass): read the shadow copy, not the MRF.
	     */
	    if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	       args[i] = c->output_regs[index].reg;
	    else
	       args[i] = get_arg(c, inst, i);
	 }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	 dst = c->output_regs[index].reg;
      else
	 dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
	 brw_MOV(p, dst, brw_abs(args[0]));
	 break;
      case OPCODE_ADD:
	 brw_ADD(p, dst, args[0], args[1]);
	 break;
      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_DP2:
	 brw_DP2(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP3:
	 brw_DP3(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP4:
	 brw_DP4(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DPH:
	 brw_DPH(p, dst, args[0], args[1]);
	 break;
      case OPCODE_NRM3:
	 emit_nrm(c, dst, args[0], 3);
	 break;
      case OPCODE_NRM4:
	 emit_nrm(c, dst, args[0], 4);
	 break;
      case OPCODE_DST:
	 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
	 break;
      case OPCODE_EXP:
	 unalias1(c, dst, args[0], emit_exp_noalias);
	 break;
      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_ARL:
	 /* ARL and FLR are both round-toward-minus-infinity. */
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FLR:
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FRC:
	 brw_FRC(p, dst, args[0]);
	 break;
      case OPCODE_LOG:
	 unalias1(c, dst, args[0], emit_log_noalias);
	 break;
      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_LIT:
	 unalias1(c, dst, args[0], emit_lit_noalias);
	 break;
      case OPCODE_LRP:
	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
	 break;
      case OPCODE_MAD:
	 /* MAD is built on MAC; skip loading the accumulator when the
	  * previous instruction provably left args[2] in it already.
	  */
	 if (!accumulator_contains(c, args[2]))
	    brw_MOV(p, brw_acc_reg(), args[2]);
	 brw_MAC(p, dst, args[0], args[1]);
	 break;
      case OPCODE_CMP:
	 emit_cmp(p, dst, args[0], args[1], args[2]);
	 break;
      case OPCODE_MAX:
	 emit_max(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MIN:
	 emit_min(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MOV:
	 brw_MOV(p, dst, args[0]);
	 break;
      case OPCODE_MUL:
	 brw_MUL(p, dst, args[0], args[1]);
	 break;
      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
	 break;

      case OPCODE_SEQ:
	 unalias2(c, dst, args[0], args[1], emit_seq);
	 break;
      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_SNE:
	 unalias2(c, dst, args[0], args[1], emit_sne);
	 break;
      case OPCODE_SGE:
	 unalias2(c, dst, args[0], args[1], emit_sge);
	 break;
      case OPCODE_SGT:
	 unalias2(c, dst, args[0], args[1], emit_sgt);
	 break;
      case OPCODE_SLT:
	 unalias2(c, dst, args[0], args[1], emit_slt);
	 break;
      case OPCODE_SLE:
	 unalias2(c, dst, args[0], args[1], emit_sle);
	 break;
      case OPCODE_SSG:
	 unalias1(c, dst, args[0], emit_sign);
	 break;
      case OPCODE_SUB:
	 brw_ADD(p, dst, args[0], negate(args[1]));
	 break;
      case OPCODE_SWZ:
	 /* The args[0] value can't be used here as it won't have
	  * correctly encoded the full swizzle:
	  */
	 emit_swz(c, dst, inst);
	 break;
      case OPCODE_TRUNC:
	 /* round toward zero */
	 brw_RNDZ(p, dst, args[0]);
	 break;
      case OPCODE_XPD:
	 emit_xpd(p, dst, args[0], args[1]);
	 break;
      case OPCODE_IF:
	 assert(if_depth < MAX_IF_DEPTH);
	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
	 /* Note that brw_IF smashes the predicate_control field. */
	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
	 if_depth_in_loop[loop_depth]++;
	 if_depth++;
	 break;
      case OPCODE_ELSE:
	 /* Cached pull constants don't survive a control-flow edge. */
	 clear_current_const(c);
	 assert(if_depth > 0);
	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
	 break;
      case OPCODE_ENDIF:
	 clear_current_const(c);
	 assert(if_depth > 0);
	 brw_ENDIF(p, if_inst[--if_depth]);
	 if_depth_in_loop[loop_depth]--;
	 break;
      case OPCODE_BGNLOOP:
	 clear_current_const(c);
	 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 if_depth_in_loop[loop_depth] = 0;
	 break;
      case OPCODE_BRK:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_BREAK(p, if_depth_in_loop[loop_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CONT:
	 brw_set_predicate_control(p, get_predicate(inst));
	 if (intel->gen >= 6) {
	    brw_CONT_gen6(p, loop_inst[loop_depth - 1]);
	 } else {
	    brw_CONT(p, if_depth_in_loop[loop_depth]);
	 }
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;

      case OPCODE_ENDLOOP: {
	 clear_current_const(c);
	 struct brw_instruction *inst0, *inst1;
	 GLuint br = 1;

	 loop_depth--;

	 /* Ironlake jump counts are in units of half an instruction. */
	 if (intel->gen == 5)
	    br = 2;

	 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);

	 if (intel->gen < 6) {
	    /* patch all the BREAK/CONT instructions from last BEGINLOOP */
	    while (inst0 > loop_inst[loop_depth]) {
	       inst0--;
	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
	       } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			  inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
	       }
	    }
	 }
      }
	 break;

      case OPCODE_BRA:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CAL:
	 /* Push the return IP (3 instructions ahead) onto the stack, bump
	  * the stack pointer, then jump; the target is patched later by
	  * brw_resolve_cals().
	  */
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 brw_ADD(p, get_addr_reg(stack_index),
		 get_addr_reg(stack_index), brw_imm_d(4));
	 brw_save_call(p, inst->Comment, p->nr_insn);
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 break;
      case OPCODE_RET:
	 /* Pop the return IP off the stack and jump to it. */
	 brw_ADD(p, get_addr_reg(stack_index),
		 get_addr_reg(stack_index), brw_imm_d(-4));
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 break;
      case OPCODE_END:
	 emit_vertex_write(c);
	 break;
      case OPCODE_PRINT:
	 /* no-op */
	 break;
      case OPCODE_BGNSUB:
	 brw_save_label(p, inst->Comment, p->nr_insn);
	 break;
      case OPCODE_ENDSUB:
	 /* no-op */
	 break;
      default:
	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
				    _mesa_opcode_string(inst->Opcode) :
				    "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

	 assert(hw_insn->header.destreg__conditionalmod == 0);
	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      /* If the result was written to the output's GRF shadow copy, mirror
       * it into the real (MRF) output register as well.
       */
      if ((inst->DstReg.File == PROGRAM_OUTPUT)
	  && (inst->DstReg.Index != VERT_RESULT_HPOS)
	  && c->output_regs[inst->DstReg.Index].used_in_src) {
	 brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When destination register is an output register and
       * it's primary/secondary front/back color, we have to clamp
       * the result to [0,1]. This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions. Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT) {
	 if ((inst->DstReg.Index == VERT_RESULT_COL0)
	     || (inst->DstReg.Index == VERT_RESULT_COL1)
	     || (inst->DstReg.Index == VERT_RESULT_BFC0)
	     || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
	    p->store[p->nr_insn-1].header.saturate = 1;
	 }
      }

      /* RelAddr destinations got a temporary from get_dst(); move the
       * result to the indirectly-addressed register now.
       */
      if (inst->DstReg.RelAddr) {
	 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
		inst->DstReg.File == PROGRAM_OUTPUT);
	 move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

   brw_resolve_cals(p);
   brw_set_uip_jip(p);

   brw_optimize(p);

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}