i965: Drop push-mode reladdr constant loading and always use constant_map.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf;
   int attributes_in_vue;
   int first_reladdr_output;
   int max_constant;
   int constant = 0;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
         /* Gen6: clip planes packed two per GRF at the start of the
          * push constant area.
          */
         for (i = 0; i < c->key.nr_userclip; i++) {
            c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
                                                  (i % 2) * 4), 0, 4, 1);
         }
         reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
         /* Pre-gen6: the planes sit after 6 slots of fixed-function
          * clip state in the CURBE.
          */
         for (i = 0; i < c->key.nr_userclip; i++) {
            c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
                                                  (i % 2) * 4), 0, 4, 1);
         }
         reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }

   }

   /* Assign some (probably all) of the vertex program constants to
    * the push constant buffer/CURBE.
    *
    * There's an obvious limit to the number of push constants equal to
    * the number of register available, and that number is smaller
    * than the minimum maximum number of vertex program parameters, so
    * support for pull constants is required if we overflow.
    * Additionally, on gen6 the number of push constants is even
    * lower.
    *
    * When there's relative addressing, we don't know what range of
    * Mesa IR registers can be accessed.  And generally, when relative
    * addressing is used we also have too many constants to load them
    * all as push constants.  So, we'll just support relative
    * addressing out of the pull constant buffers, and try to load as
    * many statically-accessed constants into the push constant buffer
    * as we can.
    */
   if (intel->gen >= 6) {
      /* We can only load 32 regs of push constants. */
      max_constant = 32 * 2 - c->key.nr_userclip;
   } else {
      max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
   }

   /* constant_map maps from ParameterValues[] index to index in the
    * push constant buffer, or -1 if it's only in the pull constant
    * buffer.
    *
    * NOTE(review): the memset size is NumParameters *bytes*, which
    * assumes constant_map entries are one byte wide -- verify against
    * the declaration of constant_map in brw_vs.h.
    */
   memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
   for (i = 0;
        i < c->vp->program.Base.NumInstructions && constant < max_constant;
        i++) {
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
      int arg;

      for (arg = 0; arg < 3 && constant < max_constant; arg++) {
         /* Only constant-like register files participate in the map. */
         if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
             inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
             inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
             inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
             inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
            continue;
         }

         /* A relative-addressed argument can touch any parameter, so it
          * must come from the pull constant buffer.
          */
         if (inst->SrcReg[arg].RelAddr) {
            c->vp->use_const_buffer = GL_TRUE;
            continue;
         }

         if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
            c->constant_map[inst->SrcReg[arg].Index] = constant++;
         }
      }
   }

   /* If we ran out of push constant space, then we'll also upload all
    * constants through the pull constant buffer so that they can be
    * accessed no matter what.  For relative addressing (the common
    * case) we need them all in place anyway.
    */
   if (constant == max_constant)
      c->vp->use_const_buffer = GL_TRUE;

   /* Push constants are packed two vec4s per GRF. */
   for (i = 0; i < constant; i++) {
      c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
                                                          (i % 2) * 4),
                                             0, 4, 1);
   }
   reg += (constant + 1) / 2;
   c->prog_data.curb_read_length = reg - 1;
   c->prog_data.nr_params = constant;
   /* XXX 0 causes a bug elsewhere... */
   if (intel->gen < 6 && c->prog_data.nr_params == 0)
      c->prog_data.nr_params = 4;

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
         c->nr_inputs++;
         c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF usable for compute-to-MRF output varies per generation;
    * earlier MRFs hold the URB write header and related payload.
    */
   if (intel->gen >= 6) {
      mrf = 3;
      if (c->key.nr_userclip)
         mrf += 2;
   } else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
         c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
         if (i == VERT_RESULT_HPOS) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else if (i == VERT_RESULT_PSIZ) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else {
            /* Two restrictions on our compute-to-MRF here.  The
             * message length for all SEND messages is restricted to
             * [1,15], so we can't use mrf 15, as that means a length
             * of 16.
             *
             * Additionally, URB writes are aligned to URB rows, so we
             * need to put an even number of registers of URB data in
             * each URB write so that the later write is aligned.  A
             * message length of 15 means 1 message header reg plus 14
             * regs of URB data.
             *
             * For attributes beyond the compute-to-MRF, we compute to
             * GRFs and they will be written in the second URB_WRITE.
             */
            if (first_reladdr_output > i && mrf < 15) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               if (mrf >= 15 && !c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
               mrf++;
            }
         }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                            reg,
                                            0,
                                            BRW_REGISTER_TYPE_D,
                                            BRW_VERTICAL_STRIDE_8,
                                            BRW_WIDTH_8,
                                            BRW_HORIZONTAL_STRIDE_1,
                                            BRW_SWIZZLE_XXXX,
                                            WRITEMASK_X);
      reg++;
   }

   /* Per-argument registers used to cache pull-constant loads; see
    * get_constant()/get_reladdr_constant().
    */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
      clear_current_const(c);
   }

   /* Outputs that are also read as sources get a GRF shadow copy, since
    * MRFs can't be read back.
    */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;           /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 6) {
      int header_regs = 2;
      if (c->key.nr_userclip)
         header_regs += 2;

      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
   } else if (intel->gen == 5)
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
423
424
425 /**
426 * If an instruction uses a temp reg both as a src and the dest, we
427 * sometimes need to allocate an intermediate temporary.
428 */
429 static void unalias1( struct brw_vs_compile *c,
430 struct brw_reg dst,
431 struct brw_reg arg0,
432 void (*func)( struct brw_vs_compile *,
433 struct brw_reg,
434 struct brw_reg ))
435 {
436 if (dst.file == arg0.file && dst.nr == arg0.nr) {
437 struct brw_compile *p = &c->func;
438 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
439 func(c, tmp, arg0);
440 brw_MOV(p, dst, tmp);
441 release_tmp(c, tmp);
442 }
443 else {
444 func(c, dst, arg0);
445 }
446 }
447
448 /**
449 * \sa unalias2
450 * Checkes if 2-operand instruction needs an intermediate temporary.
451 */
452 static void unalias2( struct brw_vs_compile *c,
453 struct brw_reg dst,
454 struct brw_reg arg0,
455 struct brw_reg arg1,
456 void (*func)( struct brw_vs_compile *,
457 struct brw_reg,
458 struct brw_reg,
459 struct brw_reg ))
460 {
461 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
462 (dst.file == arg1.file && dst.nr == arg1.nr)) {
463 struct brw_compile *p = &c->func;
464 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
465 func(c, tmp, arg0, arg1);
466 brw_MOV(p, dst, tmp);
467 release_tmp(c, tmp);
468 }
469 else {
470 func(c, dst, arg0, arg1);
471 }
472 }
473
474 /**
475 * \sa unalias2
476 * Checkes if 3-operand instruction needs an intermediate temporary.
477 */
478 static void unalias3( struct brw_vs_compile *c,
479 struct brw_reg dst,
480 struct brw_reg arg0,
481 struct brw_reg arg1,
482 struct brw_reg arg2,
483 void (*func)( struct brw_vs_compile *,
484 struct brw_reg,
485 struct brw_reg,
486 struct brw_reg,
487 struct brw_reg ))
488 {
489 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
490 (dst.file == arg1.file && dst.nr == arg1.nr) ||
491 (dst.file == arg2.file && dst.nr == arg2.nr)) {
492 struct brw_compile *p = &c->func;
493 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
494 func(c, tmp, arg0, arg1, arg2);
495 brw_MOV(p, dst, tmp);
496 release_tmp(c, tmp);
497 }
498 else {
499 func(c, dst, arg0, arg1, arg2);
500 }
501 }
502
/* Set-on-condition: dst = (arg0 cond arg1) ? 1.0 : 0.0 per channel.
 *
 * The CMP writes the flag register; the second MOV then only lands on
 * channels that passed the comparison, leaving 0.0 elsewhere.
 * NOTE(review): this relies on brw_CMP enabling predication for the
 * following instruction and on the final call restoring unconditional
 * execution -- confirm against brw_eu_emit.c.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
516
/* SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
524
/* SNE: dst = (arg0 != arg1) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* SLT: dst = (arg0 < arg1) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
539
/* SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
547
/* SGT: dst = (arg0 > arg1) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
555
/* SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
563
/* Mesa IR CMP: dst = (arg0 < 0) ? arg1 : arg2, per channel.
 *
 * The CMP with a null destination sets the flag register; the SEL then
 * picks arg1 on passing channels and arg2 otherwise, and predication is
 * explicitly cleared afterwards.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
574
/* SSG: dst = sign(arg0), i.e. -1.0, 0.0, or +1.0 per channel.
 *
 * Start with 0, then overwrite with -1 on channels where arg0 < 0 and
 * with +1 where arg0 > 0; each predicated MOV is followed by clearing
 * predication.
 */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
591
/* MAX: dst = max(arg0, arg1) per channel.
 *
 * On gen6+ a SEL with the GE conditional modifier computes the max in a
 * single instruction; earlier parts need an explicit CMP to set the
 * flags followed by a predicated SEL, with state cleared afterwards.
 */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
610
/* MIN: dst = min(arg0, arg1) per channel.
 *
 * Mirror of emit_max() using the L (less-than) condition: gen6+ uses a
 * conditional-mod SEL, earlier parts a CMP followed by predicated SEL.
 */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
629
/* Single-operand math (EXP, LOG, RSQ, ...) for gen4/5, via the shared
 * mathbox.  function is a BRW_MATH_FUNCTION_* opcode; precision selects
 * full vs. partial precision.
 */
static void emit_math1_gen4(struct brw_vs_compile *c,
			    GLuint function,
			    struct brw_reg dst,
			    struct brw_reg arg0,
			    GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   /* Route through a full-writemask GRF temp if dst is writemasked or
    * not a GRF (see kludge note above).
    */
   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
668
/* Single-operand math for gen6, executed in align1 mode through GRF
 * temporaries (math can't consume swizzles/writemasks that way).
 */
static void
emit_math1_gen6(struct brw_vs_compile *c,
		GLuint function,
		struct brw_reg dst,
		struct brw_reg arg0,
		GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src, tmp_dst;

   /* Something is strange on gen6 math in 16-wide mode, though the
    * docs say it's supposed to work.  Punt to using align1 mode,
    * which doesn't do writemasking and swizzles.
    */
   tmp_src = get_tmp(c);
   tmp_dst = get_tmp(c);

   /* Resolve the source's swizzle before switching to align1. */
   brw_MOV(p, tmp_src, arg0);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
	    tmp_dst,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    tmp_src,
	    BRW_MATH_DATA_SCALAR,
	    precision);
   brw_set_access_mode(p, BRW_ALIGN_16);

   /* Apply dst's writemask with an align16 MOV of the result. */
   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src);
   release_tmp(c, tmp_dst);
}
704
705 static void
706 emit_math1(struct brw_vs_compile *c,
707 GLuint function,
708 struct brw_reg dst,
709 struct brw_reg arg0,
710 GLuint precision)
711 {
712 struct brw_compile *p = &c->func;
713 struct intel_context *intel = &p->brw->intel;
714
715 if (intel->gen >= 6)
716 emit_math1_gen6(c, function, dst, arg0, precision);
717 else
718 emit_math1_gen4(c, function, dst, arg0, precision);
719 }
720
/* Two-operand math (POW) for gen4/5.  The second operand is delivered
 * to the mathbox through message register 3; dst goes through a temp
 * when it is writemasked or not a GRF (same kludge as emit_math1_gen4).
 */
static void emit_math2_gen4( struct brw_vs_compile *c,
			     GLuint function,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second math operand travels in mrf 3 of the SEND payload. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
755
/* Two-operand math for gen6: both sources are copied to temps in align16
 * mode (resolving swizzles), the math runs in align1 mode, and the
 * result is MOVed back applying dst's writemask.
 */
static void emit_math2_gen6( struct brw_vs_compile *c,
			     GLuint function,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src0, tmp_src1, tmp_dst;

   tmp_src0 = get_tmp(c);
   tmp_src1 = get_tmp(c);
   tmp_dst = get_tmp(c);

   brw_MOV(p, tmp_src0, arg0);
   brw_MOV(p, tmp_src1, arg1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
	     tmp_dst,
	     function,
	     tmp_src0,
	     tmp_src1);
   brw_set_access_mode(p, BRW_ALIGN_16);

   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src0);
   release_tmp(c, tmp_src1);
   release_tmp(c, tmp_dst);
}
787
788 static void emit_math2( struct brw_vs_compile *c,
789 GLuint function,
790 struct brw_reg dst,
791 struct brw_reg arg0,
792 struct brw_reg arg1,
793 GLuint precision)
794 {
795 struct brw_compile *p = &c->func;
796 struct intel_context *intel = &p->brw->intel;
797
798 if (intel->gen >= 6)
799 emit_math2_gen6(c, function, dst, arg0, arg1, precision);
800 else
801 emit_math2_gen4(c, function, dst, arg0, arg1, precision);
802 }
803
/* Mesa IR EXP instruction (noalias: dst must not overlap arg0):
 *   result = { 2^floor(x), x - floor(x), 2^x, 1.0 }
 * where x = arg0.x, computing only channels enabled in dst's writemask.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp -- built by constructing the float's bit
       * pattern directly rather than calling the mathbox.
       */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
859
860
/* Mesa IR LOG instruction (noalias):
 *   result = { exponent(x), mantissa(x), log2(x), 1.0 }
 * for x = |arg0.x|, built by float bit manipulation for X/Y and the
 * mathbox (plus the X term) for Z.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Need a readable full-writemask intermediate when dst is
    * writemasked or not a GRF (its channels are read back below).
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit, then shift the biased exponent down... */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      /* ...and un-bias it to get the true exponent. */
      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Extract the mantissa bits and splice in a biased exponent of 0,
       * yielding a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
950
951
/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 *
 * Mesa IR DST: result = { 1, arg0.y*arg1.y, arg0.z, arg1.w }, emitted as
 * one instruction per enabled destination channel.
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
972
973
/* Cross product: dst = t x u = t.yzx*u.zxy - t.zxy*u.yzx.
 *
 * The MUL with a null destination leaves its product in the implicit
 * accumulator; the MAC then multiplies the negated second pair and adds
 * the accumulator, writing the final result to dst.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
982
983
/* Mesa IR LIT (noalias): lighting coefficients
 *   result = { 1, diffuse, specular, 1 }
 * X and W are 1.0; when arg0.x > 0, Y = arg0.x and Z = arg0.y^arg0.w
 * (Z's base is clamped to 0 via the inner predicated MOV).
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *if_insn;
   struct brw_reg tmp = dst;
   /* MRF destinations can't be read back, so the POW base is staged in
    * a GRF temp in that case.
    */
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisons.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   if_insn = brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* tmp.z = max(arg0.y, 0), built with a predicated MOV. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }

   brw_ENDIF(p, if_insn);

   release_tmp(c, tmp);
}
1025
/* Mesa IR LRP (noalias): dst = arg0*arg1 + (1-arg0)*arg2.
 *
 * dst is used as scratch for (1-arg0); the MUL leaves (1-arg0)*arg2 in
 * the implicit accumulator, and the MAC adds arg0*arg1 on top.  This is
 * why dst must not alias any source ("noalias").
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
1038
1039 /** 3 or 4-component vector normalization */
1040 static void emit_nrm( struct brw_vs_compile *c,
1041 struct brw_reg dst,
1042 struct brw_reg arg0,
1043 int num_comps)
1044 {
1045 struct brw_compile *p = &c->func;
1046 struct brw_reg tmp = get_tmp(c);
1047
1048 /* tmp = dot(arg0, arg0) */
1049 if (num_comps == 3)
1050 brw_DP3(p, tmp, arg0, arg0);
1051 else
1052 brw_DP4(p, tmp, arg0, arg0);
1053
1054 /* tmp = 1 / sqrt(tmp) */
1055 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
1056
1057 /* dst = arg0 * tmp */
1058 brw_MUL(p, dst, arg0, tmp);
1059
1060 release_tmp(c, tmp);
1061 }
1062
1063
/* Get a statically-addressed constant operand from the pull constant
 * buffer, caching the load in the per-argument slot register so the
 * same constant isn't re-fetched for consecutive uses in that slot.
 *
 * Returns the register with the low vec4 replicated (XYZWXYZW) so both
 * vertices of the pair read the same values.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       16 * src->Index,               /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 0);
   const_reg.subnr = 0;

   return const_reg;
}
1099
/* Get a relative-addressed constant operand (const[a0.x + Index]) from
 * the pull constant buffer.  Unlike get_constant(), the result can
 * never be cached, so the slot's cache index is invalidated.
 *
 * NOTE(review): byte_addr_reg comes from get_tmp() and is not released
 * here -- presumably reclaimed by a later release_tmps(); verify at the
 * call sites.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);

   assert(argIndex < 3);

   assert(c->func.brw->intel.gen < 6); /* FINISHME */

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

#if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   /* Convert the vec4-granular address register to a byte offset. */
   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,                     /* writeback dest */
			     byte_addr_reg,                 /* address register */
			     16 * src->Index,               /* byte offset */
			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
			     );

   return const_reg;
}
1135
1136
1137
1138 /* TODO: relative addressing!
1139 */
1140 static struct brw_reg get_reg( struct brw_vs_compile *c,
1141 gl_register_file file,
1142 GLuint index )
1143 {
1144 switch (file) {
1145 case PROGRAM_TEMPORARY:
1146 case PROGRAM_INPUT:
1147 case PROGRAM_OUTPUT:
1148 assert(c->regs[file][index].nr != 0);
1149 return c->regs[file][index];
1150 case PROGRAM_STATE_VAR:
1151 case PROGRAM_CONSTANT:
1152 case PROGRAM_UNIFORM:
1153 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1154 return c->regs[PROGRAM_STATE_VAR][index];
1155 case PROGRAM_ADDRESS:
1156 assert(index == 0);
1157 return c->regs[file][index];
1158
1159 case PROGRAM_UNDEFINED: /* undef values */
1160 return brw_null_reg();
1161
1162 case PROGRAM_LOCAL_PARAM:
1163 case PROGRAM_ENV_PARAM:
1164 case PROGRAM_WRITE_ONLY:
1165 default:
1166 assert(0);
1167 return brw_null_reg();
1168 }
1169 }
1170
1171
1172 /**
1173 * Indirect addressing: get reg[[arg] + offset].
1174 */
1175 static struct brw_reg deref( struct brw_vs_compile *c,
1176 struct brw_reg arg,
1177 GLint offset,
1178 GLuint reg_size )
1179 {
1180 struct brw_compile *p = &c->func;
1181 struct brw_reg tmp = get_tmp(c);
1182 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1183 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1184 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
1185 struct brw_reg indirect = brw_vec4_indirect(0,0);
1186 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1187
1188 /* Set the vertical stride on the register access so that the first
1189 * 4 components come from a0.0 and the second 4 from a0.1.
1190 */
1191 indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1192
1193 {
1194 brw_push_insn_state(p);
1195 brw_set_access_mode(p, BRW_ALIGN_1);
1196
1197 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1198 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1199
1200 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1201 brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1202
1203 brw_MOV(p, tmp, indirect);
1204
1205 brw_pop_insn_state(p);
1206 }
1207
1208 /* NOTE: tmp not released */
1209 return tmp;
1210 }
1211
/**
 * Copy 'val' into the instruction's relative-addressed destination,
 * reg[DstReg.Index + a0].
 *
 * Destination indirect addressing is only 1x1 (a single address index),
 * so each of the two interleaved vertices' vec4s is written with its own
 * address computation and MOV.  The 'acc' temporary is reclaimed by
 * release_tmps() at the end of the instruction.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
                    const struct prog_instruction *inst,
                    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   /* Byte offset of the base register: 32 bytes per GRF. */
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* Vertex 0: a0.0 = index * reg_size + byte_offset, then store. */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Vertex 1: second address component, upper half of each GRF. */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
           brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1246
1247 /**
1248 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1249 * TODO: relative addressing!
1250 */
1251 static struct brw_reg
1252 get_src_reg( struct brw_vs_compile *c,
1253 const struct prog_instruction *inst,
1254 GLuint argIndex )
1255 {
1256 const GLuint file = inst->SrcReg[argIndex].File;
1257 const GLint index = inst->SrcReg[argIndex].Index;
1258 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
1259
1260 if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1261 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1262
1263 if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1264 SWIZZLE_ZERO,
1265 SWIZZLE_ZERO,
1266 SWIZZLE_ZERO)) {
1267 return brw_imm_f(0.0f);
1268 } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1269 SWIZZLE_ONE,
1270 SWIZZLE_ONE,
1271 SWIZZLE_ONE)) {
1272 if (src->Negate)
1273 return brw_imm_f(-1.0F);
1274 else
1275 return brw_imm_f(1.0F);
1276 } else if (src->File == PROGRAM_CONSTANT) {
1277 const struct gl_program_parameter_list *params;
1278 float f;
1279 int component = -1;
1280
1281 switch (src->Swizzle) {
1282 case SWIZZLE_XXXX:
1283 component = 0;
1284 break;
1285 case SWIZZLE_YYYY:
1286 component = 1;
1287 break;
1288 case SWIZZLE_ZZZZ:
1289 component = 2;
1290 break;
1291 case SWIZZLE_WWWW:
1292 component = 3;
1293 break;
1294 }
1295
1296 if (component >= 0) {
1297 params = c->vp->program.Base.Parameters;
1298 f = params->ParameterValues[src->Index][component];
1299
1300 if (src->Abs)
1301 f = fabs(f);
1302 if (src->Negate)
1303 f = -f;
1304 return brw_imm_f(f);
1305 }
1306 }
1307 }
1308
1309 switch (file) {
1310 case PROGRAM_TEMPORARY:
1311 case PROGRAM_INPUT:
1312 case PROGRAM_OUTPUT:
1313 if (relAddr) {
1314 return deref(c, c->regs[file][0], index, 32);
1315 }
1316 else {
1317 assert(c->regs[file][index].nr != 0);
1318 return c->regs[file][index];
1319 }
1320
1321 case PROGRAM_STATE_VAR:
1322 case PROGRAM_CONSTANT:
1323 case PROGRAM_UNIFORM:
1324 case PROGRAM_ENV_PARAM:
1325 case PROGRAM_LOCAL_PARAM:
1326 if (!relAddr && c->constant_map[index] != -1) {
1327 /* Take from the push constant buffer if possible. */
1328 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1329 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1330 } else {
1331 /* Must be in the pull constant buffer then .*/
1332 assert(c->vp->use_const_buffer);
1333 if (relAddr)
1334 return get_reladdr_constant(c, inst, argIndex);
1335 else
1336 return get_constant(c, inst, argIndex);
1337 }
1338 case PROGRAM_ADDRESS:
1339 assert(index == 0);
1340 return c->regs[file][index];
1341
1342 case PROGRAM_UNDEFINED:
1343 /* this is a normal case since we loop over all three src args */
1344 return brw_null_reg();
1345
1346 case PROGRAM_WRITE_ONLY:
1347 default:
1348 assert(0);
1349 return brw_null_reg();
1350 }
1351 }
1352
1353 /**
1354 * Return the brw reg for the given instruction's src argument.
1355 * Will return mangled results for SWZ op. The emit_swz() function
1356 * ignores this result and recalculates taking extended swizzles into
1357 * account.
1358 */
1359 static struct brw_reg get_arg( struct brw_vs_compile *c,
1360 const struct prog_instruction *inst,
1361 GLuint argIndex )
1362 {
1363 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1364 struct brw_reg reg;
1365
1366 if (src->File == PROGRAM_UNDEFINED)
1367 return brw_null_reg();
1368
1369 reg = get_src_reg(c, inst, argIndex);
1370
1371 /* Convert 3-bit swizzle to 2-bit.
1372 */
1373 if (reg.file != BRW_IMMEDIATE_VALUE) {
1374 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1375 GET_SWZ(src->Swizzle, 1),
1376 GET_SWZ(src->Swizzle, 2),
1377 GET_SWZ(src->Swizzle, 3));
1378 }
1379
1380 /* Note this is ok for non-swizzle instructions:
1381 */
1382 reg.negate = src->Negate ? 1 : 0;
1383
1384 return reg;
1385 }
1386
1387
1388 /**
1389 * Get brw register for the given program dest register.
1390 */
1391 static struct brw_reg get_dst( struct brw_vs_compile *c,
1392 struct prog_dst_register dst )
1393 {
1394 struct brw_reg reg;
1395
1396 switch (dst.File) {
1397 case PROGRAM_TEMPORARY:
1398 case PROGRAM_OUTPUT:
1399 /* register-indirect addressing is only 1x1, not VxH, for
1400 * destination regs. So, for RelAddr we'll return a temporary
1401 * for the dest and do a move of the result to the RelAddr
1402 * register after the instruction emit.
1403 */
1404 if (dst.RelAddr) {
1405 reg = get_tmp(c);
1406 } else {
1407 assert(c->regs[dst.File][dst.Index].nr != 0);
1408 reg = c->regs[dst.File][dst.Index];
1409 }
1410 break;
1411 case PROGRAM_ADDRESS:
1412 assert(dst.Index == 0);
1413 reg = c->regs[dst.File][dst.Index];
1414 break;
1415 case PROGRAM_UNDEFINED:
1416 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1417 reg = brw_null_reg();
1418 break;
1419 default:
1420 assert(0);
1421 reg = brw_null_reg();
1422 }
1423
1424 assert(reg.type != BRW_IMMEDIATE_VALUE);
1425 reg.dw1.bits.writemask = dst.WriteMask;
1426
1427 return reg;
1428 }
1429
1430
/**
 * Emit OPCODE_SWZ, which supports extended swizzles (per-channel X/Y/Z/W,
 * literal 0 or 1, and per-channel negate) that a plain source swizzle
 * can't express.
 *
 * The destination channels are partitioned into three groups — real
 * source channels, constant zeros, and constant ones — each written with
 * its own MOV, then negated per-channel at the end.  A temporary is used
 * when negation could otherwise read back a non-GRF destination.
 */
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel: real source component, 0, or 1. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask here, used as a writemask so
    * that only the negated channels are rewritten.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1499
1500
1501 /**
1502 * Post-vertex-program processing. Send the results to the URB.
1503 */
1504 static void emit_vertex_write( struct brw_vs_compile *c)
1505 {
1506 struct brw_compile *p = &c->func;
1507 struct brw_context *brw = p->brw;
1508 struct intel_context *intel = &brw->intel;
1509 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1510 struct brw_reg ndc;
1511 int eot;
1512 GLuint len_vertex_header = 2;
1513 int next_mrf, i;
1514
1515 if (c->key.copy_edgeflag) {
1516 brw_MOV(p,
1517 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1518 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1519 }
1520
1521 if (intel->gen < 6) {
1522 /* Build ndc coords */
1523 ndc = get_tmp(c);
1524 /* ndc = 1.0 / pos.w */
1525 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1526 /* ndc.xyz = pos * ndc */
1527 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1528 }
1529
1530 /* Update the header for point size, user clipping flags, and -ve rhw
1531 * workaround.
1532 */
1533 if (intel->gen >= 6) {
1534 struct brw_reg m1 = brw_message_reg(1);
1535
1536 /* On gen6, m1 has each value in a separate dword, so we never
1537 * need to mess with a temporary for computing the m1 value.
1538 */
1539 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1540 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1541 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
1542 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
1543 }
1544
1545 /* Set the user clip distances in dword 8-15. (m3-4)*/
1546 if (c->key.nr_userclip) {
1547 for (i = 0; i < c->key.nr_userclip; i++) {
1548 struct brw_reg m;
1549 if (i < 4)
1550 m = brw_message_reg(3);
1551 else
1552 m = brw_message_reg(4);
1553
1554 brw_DP4(p, brw_writemask(m, (1 << (i & 7))),pos, c->userplane[i]);
1555 }
1556 }
1557 } else if ((c->prog_data.outputs_written &
1558 BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1559 c->key.nr_userclip || brw->has_negative_rhw_bug) {
1560 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1561 GLuint i;
1562
1563 brw_MOV(p, header1, brw_imm_ud(0));
1564
1565 brw_set_access_mode(p, BRW_ALIGN_16);
1566
1567 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1568 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1569 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
1570 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1571 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
1572 header1, brw_imm_ud(0x7ff<<8));
1573 }
1574
1575 for (i = 0; i < c->key.nr_userclip; i++) {
1576 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1577 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1578 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1579 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1580 }
1581
1582 /* i965 clipping workaround:
1583 * 1) Test for -ve rhw
1584 * 2) If set,
1585 * set ndc = (0,0,0,0)
1586 * set ucp[6] = 1
1587 *
1588 * Later, clipping will detect ucp[6] and ensure the primitive is
1589 * clipped against all fixed planes.
1590 */
1591 if (brw->has_negative_rhw_bug) {
1592 brw_CMP(p,
1593 vec8(brw_null_reg()),
1594 BRW_CONDITIONAL_L,
1595 brw_swizzle1(ndc, 3),
1596 brw_imm_f(0));
1597
1598 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1599 brw_MOV(p, ndc, brw_imm_f(0));
1600 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1601 }
1602
1603 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1604 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1605 brw_set_access_mode(p, BRW_ALIGN_16);
1606
1607 release_tmp(c, header1);
1608 }
1609 else {
1610 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1611 }
1612
1613 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1614 * of zeros followed by two sets of NDC coordinates:
1615 */
1616 brw_set_access_mode(p, BRW_ALIGN_1);
1617 brw_set_acc_write_control(p, 0);
1618
1619 /* The VUE layout is documented in Volume 2a. */
1620 if (intel->gen >= 6) {
1621 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1622 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1623 * dword 4-7 (m2) is the 4D space position
1624 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1625 * enabled.
1626 * m3 or 5 is the first vertex element data we fill, which is
1627 * the vertex position.
1628 */
1629 brw_MOV(p, brw_message_reg(2), pos);
1630 len_vertex_header = 1;
1631 if (c->key.nr_userclip > 0)
1632 len_vertex_header += 2;
1633 } else if (intel->gen == 5) {
1634 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1635 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1636 * dword 4-7 (m2) is the ndc position (set above)
1637 * dword 8-11 (m3) of the vertex header is the 4D space position
1638 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1639 * m6 is a pad so that the vertex element data is aligned
1640 * m7 is the first vertex data we fill, which is the vertex position.
1641 */
1642 brw_MOV(p, brw_message_reg(2), ndc);
1643 brw_MOV(p, brw_message_reg(3), pos);
1644 brw_MOV(p, brw_message_reg(7), pos);
1645 len_vertex_header = 6;
1646 } else {
1647 /* There are 8 dwords in VUE header pre-Ironlake:
1648 * dword 0-3 (m1) is indices, point width, clip flags.
1649 * dword 4-7 (m2) is ndc position (set above)
1650 *
1651 * dword 8-11 (m3) is the first vertex data, which we always have be the
1652 * vertex position.
1653 */
1654 brw_MOV(p, brw_message_reg(2), ndc);
1655 brw_MOV(p, brw_message_reg(3), pos);
1656 len_vertex_header = 2;
1657 }
1658
1659 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1660 next_mrf = 2 + len_vertex_header;
1661 for (i = 0; i < VERT_RESULT_MAX; i++) {
1662 if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
1663 break;
1664 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
1665 continue;
1666 if (i == VERT_RESULT_PSIZ)
1667 continue;
1668
1669 if (i >= VERT_RESULT_TEX0 &&
1670 c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
1671 brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
1672 next_mrf++;
1673 } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
1674 next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
1675 }
1676 }
1677
1678 eot = (c->first_overflow_output == 0);
1679
1680 brw_urb_WRITE(p,
1681 brw_null_reg(), /* dest */
1682 0, /* starting mrf reg nr */
1683 c->r0, /* src */
1684 0, /* allocate */
1685 1, /* used */
1686 MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
1687 0, /* response len */
1688 eot, /* eot */
1689 eot, /* writes complete */
1690 0, /* urb destination offset */
1691 BRW_URB_SWIZZLE_INTERLEAVE);
1692
1693 if (c->first_overflow_output > 0) {
1694 /* Not all of the vertex outputs/results fit into the MRF.
1695 * Move the overflowed attributes from the GRF to the MRF and
1696 * issue another brw_urb_WRITE().
1697 */
1698 GLuint i, mrf = 1;
1699 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1700 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1701 /* move from GRF to MRF */
1702 brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1703 mrf++;
1704 }
1705 }
1706
1707 brw_urb_WRITE(p,
1708 brw_null_reg(), /* dest */
1709 0, /* starting mrf reg nr */
1710 c->r0, /* src */
1711 0, /* allocate */
1712 1, /* used */
1713 mrf, /* msg len */
1714 0, /* response len */
1715 1, /* eot */
1716 1, /* writes complete */
1717 14 / 2, /* urb destination offset */
1718 BRW_URB_SWIZZLE_INTERLEAVE);
1719 }
1720 }
1721
1722 static GLboolean
1723 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1724 {
1725 struct brw_compile *p = &c->func;
1726 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1727
1728 if (p->nr_insn == 0)
1729 return GL_FALSE;
1730
1731 if (val.address_mode != BRW_ADDRESS_DIRECT)
1732 return GL_FALSE;
1733
1734 switch (prev_insn->header.opcode) {
1735 case BRW_OPCODE_MOV:
1736 case BRW_OPCODE_MAC:
1737 case BRW_OPCODE_MUL:
1738 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1739 prev_insn->header.execution_size == val.width &&
1740 prev_insn->bits1.da1.dest_reg_file == val.file &&
1741 prev_insn->bits1.da1.dest_reg_type == val.type &&
1742 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1743 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1744 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1745 prev_insn->bits1.da16.dest_writemask == 0xf)
1746 return GL_TRUE;
1747 else
1748 return GL_FALSE;
1749 default:
1750 return GL_FALSE;
1751 }
1752 }
1753
1754 static uint32_t
1755 get_predicate(const struct prog_instruction *inst)
1756 {
1757 if (inst->DstReg.CondMask == COND_TR)
1758 return BRW_PREDICATE_NONE;
1759
1760 /* All of GLSL only produces predicates for COND_NE and one channel per
1761 * vector. Fail badly if someone starts doing something else, as it might
1762 * mean infinite looping or something.
1763 *
1764 * We'd like to support all the condition codes, but our hardware doesn't
1765 * quite match the Mesa IR, which is modeled after the NV extensions. For
1766 * those, the instruction may update the condition codes or not, then any
1767 * later instruction may use one of those condition codes. For gen4, the
1768 * instruction may update the flags register based on one of the condition
1769 * codes output by the instruction, and then further instructions may
1770 * predicate on that. We can probably support this, but it won't
1771 * necessarily be easy.
1772 */
1773 assert(inst->DstReg.CondMask == COND_NE);
1774
1775 switch (inst->DstReg.CondSwizzle) {
1776 case SWIZZLE_XXXX:
1777 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1778 case SWIZZLE_YYYY:
1779 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1780 case SWIZZLE_ZZZZ:
1781 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1782 case SWIZZLE_WWWW:
1783 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1784 default:
1785 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1786 inst->DstReg.CondMask);
1787 return BRW_PREDICATE_NORMAL;
1788 }
1789 }
1790
/* Emit the vertex program instructions here.
 *
 * Two passes over the Mesa IR: the first records which outputs are read
 * as sources (message registers can't be read back) and whether a call
 * stack is needed; the second, after static register allocation,
 * translates each instruction to native code.
 */
void brw_vs_emit(struct brw_vs_compile *c )
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
			       GL_TRUE);
      printf("\n");
   }

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

   brw_set_acc_write_control(p, 1);

   /* First pass: gather information needed for register allocation. */
   for (insn = 0; insn < nr_insns; insn++) {
      GLuint i;
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

      /* Message registers can't be read, so copy the output into GRF
       * register if they are used in source registers
       */
      for (i = 0; i < 3; i++) {
         struct prog_src_register *src = &inst->SrcReg[i];
         GLuint index = src->Index;
         GLuint file = src->File;
         if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
            c->output_regs[index].used_in_src = GL_TRUE;
      }

      switch (inst->Opcode) {
      case OPCODE_CAL:
      case OPCODE_RET:
         c->needs_stack = GL_TRUE;
         break;
      default:
         break;
      }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   /* Second pass: translate each instruction. */
   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
	 for (i = 0; i < 3; i++) {
	    const struct prog_src_register *src = &inst->SrcReg[i];
	    index = src->Index;
	    file = src->File;
	    /* Outputs read as sources use their GRF shadow copy. */
	    if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	       args[i] = c->output_regs[index].reg;
	    else
	       args[i] = get_arg(c, inst, i);
	 }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
	 dst = c->output_regs[index].reg;
      else
	 dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
	 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
	 brw_MOV(p, dst, brw_abs(args[0]));
	 break;
      case OPCODE_ADD:
	 brw_ADD(p, dst, args[0], args[1]);
	 break;
      case OPCODE_COS:
	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_DP2:
	 brw_DP2(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP3:
	 brw_DP3(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DP4:
	 brw_DP4(p, dst, args[0], args[1]);
	 break;
      case OPCODE_DPH:
	 brw_DPH(p, dst, args[0], args[1]);
	 break;
      case OPCODE_NRM3:
	 emit_nrm(c, dst, args[0], 3);
	 break;
      case OPCODE_NRM4:
	 emit_nrm(c, dst, args[0], 4);
	 break;
      case OPCODE_DST:
	 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
	 break;
      case OPCODE_EXP:
	 unalias1(c, dst, args[0], emit_exp_noalias);
	 break;
      case OPCODE_EX2:
	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_ARL:
	 /* Address loads are round-down, like FLR. */
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FLR:
	 brw_RNDD(p, dst, args[0]);
	 break;
      case OPCODE_FRC:
	 brw_FRC(p, dst, args[0]);
	 break;
      case OPCODE_LOG:
	 unalias1(c, dst, args[0], emit_log_noalias);
	 break;
      case OPCODE_LG2:
	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_LIT:
	 unalias1(c, dst, args[0], emit_lit_noalias);
	 break;
      case OPCODE_LRP:
	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
	 break;
      case OPCODE_MAD:
	 /* MAC uses the accumulator implicitly; preload it with args[2]
	  * unless the previous instruction already left it there.
	  */
	 if (!accumulator_contains(c, args[2]))
	    brw_MOV(p, brw_acc_reg(), args[2]);
	 brw_MAC(p, dst, args[0], args[1]);
	 break;
      case OPCODE_CMP:
	 emit_cmp(p, dst, args[0], args[1], args[2]);
	 break;
      case OPCODE_MAX:
	 emit_max(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MIN:
	 emit_min(p, dst, args[0], args[1]);
	 break;
      case OPCODE_MOV:
	 brw_MOV(p, dst, args[0]);
	 break;
      case OPCODE_MUL:
	 brw_MUL(p, dst, args[0], args[1]);
	 break;
      case OPCODE_POW:
	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RCP:
	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_RSQ:
	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
	 break;

      case OPCODE_SEQ:
	 unalias2(c, dst, args[0], args[1], emit_seq);
	 break;
      case OPCODE_SIN:
	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
	 break;
      case OPCODE_SNE:
	 unalias2(c, dst, args[0], args[1], emit_sne);
	 break;
      case OPCODE_SGE:
	 unalias2(c, dst, args[0], args[1], emit_sge);
	 break;
      case OPCODE_SGT:
	 unalias2(c, dst, args[0], args[1], emit_sgt);
	 break;
      case OPCODE_SLT:
	 unalias2(c, dst, args[0], args[1], emit_slt);
	 break;
      case OPCODE_SLE:
	 unalias2(c, dst, args[0], args[1], emit_sle);
	 break;
      case OPCODE_SSG:
	 unalias1(c, dst, args[0], emit_sign);
	 break;
      case OPCODE_SUB:
	 brw_ADD(p, dst, args[0], negate(args[1]));
	 break;
      case OPCODE_SWZ:
	 /* The args[0] value can't be used here as it won't have
	  * correctly encoded the full swizzle:
	  */
	 emit_swz(c, dst, inst);
	 break;
      case OPCODE_TRUNC:
         /* round toward zero */
	 brw_RNDZ(p, dst, args[0]);
	 break;
      case OPCODE_XPD:
	 emit_xpd(p, dst, args[0], args[1]);
	 break;
      case OPCODE_IF:
	 assert(if_depth < MAX_IF_DEPTH);
	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
	 /* Note that brw_IF smashes the predicate_control field. */
	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
	 if_depth_in_loop[loop_depth]++;
	 if_depth++;
	 break;
      case OPCODE_ELSE:
	 clear_current_const(c);
	 assert(if_depth > 0);
	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
	 break;
      case OPCODE_ENDIF:
	 clear_current_const(c);
         assert(if_depth > 0);
	 brw_ENDIF(p, if_inst[--if_depth]);
	 if_depth_in_loop[loop_depth]--;
	 break;
      case OPCODE_BGNLOOP:
	 clear_current_const(c);
         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 if_depth_in_loop[loop_depth] = 0;
         break;
      case OPCODE_BRK:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_BREAK(p, if_depth_in_loop[loop_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
	 brw_set_predicate_control(p, get_predicate(inst));
	 if (intel->gen >= 6) {
	    brw_CONT_gen6(p, loop_inst[loop_depth - 1]);
	 } else {
	    brw_CONT(p, if_depth_in_loop[loop_depth]);
	 }
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case OPCODE_ENDLOOP: {
	 clear_current_const(c);
	 struct brw_instruction *inst0, *inst1;
	 GLuint br = 1;

	 loop_depth--;

	 /* Ironlake jump counts are in units of 2 (64-bit) instructions. */
	 if (intel->gen == 5)
	    br = 2;

	 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);

	 if (intel->gen < 6) {
	    /* patch all the BREAK/CONT instructions from last BEGINLOOP */
	    while (inst0 > loop_inst[loop_depth]) {
	       inst0--;
	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
	       } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			  inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
	       }
	    }
	 }
      }
         break;

      case OPCODE_BRA:
	 brw_set_predicate_control(p, get_predicate(inst));
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case OPCODE_CAL:
	 /* Push the return IP onto the software stack, then jump. */
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 brw_ADD(p, get_addr_reg(stack_index),
			 get_addr_reg(stack_index), brw_imm_d(4));
	 brw_save_call(p, inst->Comment, p->nr_insn);
	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         break;
      case OPCODE_RET:
	 /* Pop the return IP off the software stack and jump to it. */
	 brw_ADD(p, get_addr_reg(stack_index),
			 get_addr_reg(stack_index), brw_imm_d(-4));
	 brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 break;
      case OPCODE_END:
	 emit_vertex_write(c);
         break;
      case OPCODE_PRINT:
         /* no-op */
         break;
      case OPCODE_BGNSUB:
         brw_save_label(p, inst->Comment, p->nr_insn);
         break;
      case OPCODE_ENDSUB:
         /* no-op */
         break;
      default:
	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
				    _mesa_opcode_string(inst->Opcode) :
				    "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

	 assert(hw_insn->header.destreg__conditionalmod == 0);
	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      /* Keep the GRF shadow copy of an output in sync with the MRF. */
      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When destination register is an output register and
       * it's primary/secondary front/back color, we have to clamp
       * the result to [0,1]. This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions. Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      /* Relative-addressed destinations were written to a temporary
       * (see get_dst()); copy the result into place now.
       */
      if (inst->DstReg.RelAddr) {
	 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
		inst->DstReg.File == PROGRAM_OUTPUT);
	 move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

   brw_resolve_cals(p);
   brw_set_uip_jip(p);

   brw_optimize(p);

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
	 brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}