1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return whether the given source arg of this opcode can be an immediate float
41  * operand instead of a PROGRAM_CONSTANT value fetched through push/pull constants.
42  */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
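   /* The table above gives each opcode's source-arg count; combined with
    * the check at the bottom of this function, only the last source slot
    * of an instruction is allowed to be an immediate.
    */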
65
66 /* These opcodes get broken down in a way that allows two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode >= ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
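/* Temps are allocated stack-like; only the most recently allocated temp
 * can actually be reclaimed here, so out-of-order releases are silently
 * ignored (see release_tmps() for the bulk reset).
 */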
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
135 /**
136  * Preallocate GRF registers before code emit.
137 * Do things as simply as possible. Allocate and populate all regs
138 * ahead of time.
139 */
140 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
141 {
142 struct intel_context *intel = &c->func.brw->intel;
143 GLuint i, reg = 0, mrf, j;
144 int attributes_in_vue;
145 int first_reladdr_output;
146 int max_constant;
147 int constant = 0;
148 int vert_result_reorder[VERT_RESULT_MAX];
149 int bfc = 0;
150
151 /* Determine whether to use a real constant buffer or use a block
152  * of GRF registers for constants.  The latter is faster but only
153 * works if everything fits in the GRF.
154 * XXX this heuristic/check may need some fine tuning...
155 */
156 if (c->vp->program.Base.Parameters->NumParameters +
157 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
158 c->vp->use_const_buffer = GL_TRUE;
159 else
160 c->vp->use_const_buffer = GL_FALSE;
161
162 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
163
164 /* r0 -- reserved as usual
165 */
166 c->r0 = brw_vec8_grf(reg, 0);
167 reg++;
168
169 /* User clip planes from curbe:
170 */
171 if (c->key.nr_userclip) {
172 if (intel->gen >= 6) {
173 for (i = 0; i < c->key.nr_userclip; i++) {
174 c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
175 (i % 2) * 4), 0, 4, 1);
176 }
177 reg += ALIGN(c->key.nr_userclip, 2) / 2;
178 } else {
179 for (i = 0; i < c->key.nr_userclip; i++) {
180 c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
181 (i % 2) * 4), 0, 4, 1);
182 }
183 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
184 }
185
186 }
187
188 /* Assign some (probably all) of the vertex program constants to
189 * the push constant buffer/CURBE.
190 *
191  * There's an obvious limit to the number of push constants equal to
192  * the number of registers available, and that number is smaller
193 * than the minimum maximum number of vertex program parameters, so
194 * support for pull constants is required if we overflow.
195 * Additionally, on gen6 the number of push constants is even
196 * lower.
197 *
198 * When there's relative addressing, we don't know what range of
199 * Mesa IR registers can be accessed. And generally, when relative
200 * addressing is used we also have too many constants to load them
201 * all as push constants. So, we'll just support relative
202 * addressing out of the pull constant buffers, and try to load as
203 * many statically-accessed constants into the push constant buffer
204 * as we can.
205 */
206 if (intel->gen >= 6) {
207 /* We can only load 32 regs of push constants. */
208 max_constant = 32 * 2 - c->key.nr_userclip;
209 } else {
210 max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
211 }
212
213 /* constant_map maps from ParameterValues[] index to index in the
214 * push constant buffer, or -1 if it's only in the pull constant
215 * buffer.
216 */
217 memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
218 for (i = 0;
219 i < c->vp->program.Base.NumInstructions && constant < max_constant;
220 i++) {
221 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
222 int arg;
223
224 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
225 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
226 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
227 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
228 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
229 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
230 continue;
231 }
232
233 if (inst->SrcReg[arg].RelAddr) {
234 c->vp->use_const_buffer = GL_TRUE;
235 continue;
236 }
237
238 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
239 c->constant_map[inst->SrcReg[arg].Index] = constant++;
240 }
241 }
242 }
243
244 /* If we ran out of push constant space, then we'll also upload all
245 * constants through the pull constant buffer so that they can be
246 * accessed no matter what. For relative addressing (the common
247 * case) we need them all in place anyway.
248 */
249 if (constant == max_constant)
250 c->vp->use_const_buffer = GL_TRUE;
251
252 for (i = 0; i < constant; i++) {
253 c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
254 (i % 2) * 4),
255 0, 4, 1);
256 }
257 reg += (constant + 1) / 2;
258 c->prog_data.curb_read_length = reg - 1;
259 c->prog_data.nr_params = constant * 4;
260 /* XXX 0 causes a bug elsewhere... */
261 if (intel->gen < 6 && c->prog_data.nr_params == 0)
262 c->prog_data.nr_params = 4;
263
264 /* Allocate input regs:
265 */
266 c->nr_inputs = 0;
267 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
268 if (c->prog_data.inputs_read & (1 << i)) {
269 c->nr_inputs++;
270 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
271 reg++;
272 }
273 }
274 /* If there are no inputs, we'll still be reading one attribute's worth
275 * because it's required -- see urb_read_length setting.
276 */
277 if (c->nr_inputs == 0)
278 reg++;
279
280 /* Allocate outputs. The non-position outputs go straight into message regs.
281 */
282 c->nr_outputs = 0;
283 c->first_output = reg;
284 c->first_overflow_output = 0;
285
286 if (intel->gen >= 6) {
287 mrf = 3;
288 if (c->key.nr_userclip)
289 mrf += 2;
290 } else if (intel->gen == 5)
291 mrf = 8;
292 else
293 mrf = 4;
294
295 first_reladdr_output = get_first_reladdr_output(&c->vp->program);
296
297 for (i = 0; i < VERT_RESULT_MAX; i++)
298 vert_result_reorder[i] = i;
299
300 /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
301 if (intel->gen >= 6 && c->key.two_side_color) {
302 if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
303 (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
304 assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
305 assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
306 bfc = 2;
307 } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
308 (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
309 bfc = 1;
310
311 if (bfc) {
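         /* With bfc == 2 the VUE slot order becomes COL0, BFC0, COL1,
          * BFC1, and every result that used to follow the colors is
          * shifted down by bfc slots by the second loop below.
          */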
312 for (i = 0; i < bfc; i++) {
313 vert_result_reorder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i;
314 vert_result_reorder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i;
315 }
316
317 for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) {
318 vert_result_reorder[i] = i - bfc;
319 }
320 }
321 }
322
323 for (j = 0; j < VERT_RESULT_MAX; j++) {
324 i = vert_result_reorder[j];
325
326 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
327 c->nr_outputs++;
328 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
329 if (i == VERT_RESULT_HPOS) {
330 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
331 reg++;
332 }
333 else if (i == VERT_RESULT_PSIZ) {
334 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
335 reg++;
336 }
337 else {
338 /* Two restrictions on our compute-to-MRF here. The
339 * message length for all SEND messages is restricted to
340 * [1,15], so we can't use mrf 15, as that means a length
341 * of 16.
342 *
343 * Additionally, URB writes are aligned to URB rows, so we
344 * need to put an even number of registers of URB data in
345 * each URB write so that the later write is aligned. A
346 * message length of 15 means 1 message header reg plus 14
347 * regs of URB data.
348 *
349 * For attributes beyond the compute-to-MRF, we compute to
350 * GRFs and they will be written in the second URB_WRITE.
351 */
352 if (first_reladdr_output > i && mrf < 15) {
353 c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
354 mrf++;
355 }
356 else {
357 if (mrf >= 15 && !c->first_overflow_output)
358 c->first_overflow_output = i;
359 c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
360 reg++;
361 mrf++;
362 }
363 }
364 }
365 }
366
367 /* Allocate program temporaries:
368 */
369 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
370 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
371 reg++;
372 }
373
374 /* Address reg(s). Don't try to use the internal address reg until
375 * deref time.
376 */
377 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
378 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
379 reg,
380 0,
381 BRW_REGISTER_TYPE_D,
382 BRW_VERTICAL_STRIDE_8,
383 BRW_WIDTH_8,
384 BRW_HORIZONTAL_STRIDE_1,
385 BRW_SWIZZLE_XXXX,
386 WRITEMASK_X);
387 reg++;
388 }
389
390 if (c->vp->use_const_buffer) {
391 for (i = 0; i < 3; i++) {
392 c->current_const[i].reg = brw_vec8_grf(reg, 0);
393 reg++;
394 }
395 clear_current_const(c);
396 }
397
398 for (i = 0; i < 128; i++) {
399 if (c->output_regs[i].used_in_src) {
400 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
401 reg++;
402 }
403 }
404
405 if (c->needs_stack) {
406 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
407 reg += 2;
408 }
409
410 /* Some opcodes need an internal temporary:
411 */
412 c->first_tmp = reg;
413 c->last_tmp = reg; /* for allocation purposes */
414
415 /* Each input reg holds data from two vertices. The
416 * urb_read_length is the number of registers read from *each*
417 * vertex urb, so is half the amount:
418 */
419 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
420 /* Setting this field to 0 leads to undefined behavior according to the
421  * VS_STATE docs.  Our VUEs will always have at least one attribute
422 * sitting in them, even if it's padding.
423 */
424 if (c->prog_data.urb_read_length == 0)
425 c->prog_data.urb_read_length = 1;
426
427 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
428 * them to fit the biggest thing they need to.
429 */
430 attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
431
432 /* See emit_vertex_write() for where the VUE's overhead on top of the
433 * attributes comes from.
434 */
435 if (intel->gen >= 6) {
436 int header_regs = 2;
437 if (c->key.nr_userclip)
438 header_regs += 2;
439
440 /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
441 * number of 128-byte (1024-bit) units.
442 */
443 c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
444 } else if (intel->gen == 5)
445 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
446 * number of 64-byte (512-bit) units.
447 */
448 c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
449 else
450 c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
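   /* Example for the gen6 case above: 10 attributes plus the two-reg
    * header round up to (10 + 2 + 7) / 8 = 2 of the 1024-bit allocation
    * units.
    */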
451
452 c->prog_data.total_grf = reg;
453
454 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
455 printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
456 printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
457 printf("%s reg = %d\n", __FUNCTION__, reg);
458 }
459 }
460
461
462 /**
463 * If an instruction uses a temp reg both as a src and the dest, we
464 * sometimes need to allocate an intermediate temporary.
465 */
466 static void unalias1( struct brw_vs_compile *c,
467 struct brw_reg dst,
468 struct brw_reg arg0,
469 void (*func)( struct brw_vs_compile *,
470 struct brw_reg,
471 struct brw_reg ))
472 {
473 if (dst.file == arg0.file && dst.nr == arg0.nr) {
474 struct brw_compile *p = &c->func;
475 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
476 func(c, tmp, arg0);
477 brw_MOV(p, dst, tmp);
478 release_tmp(c, tmp);
479 }
480 else {
481 func(c, dst, arg0);
482 }
483 }
484
485 /**
486  * \sa unalias1
487  * Checks if a 2-operand instruction needs an intermediate temporary.
488 */
489 static void unalias2( struct brw_vs_compile *c,
490 struct brw_reg dst,
491 struct brw_reg arg0,
492 struct brw_reg arg1,
493 void (*func)( struct brw_vs_compile *,
494 struct brw_reg,
495 struct brw_reg,
496 struct brw_reg ))
497 {
498 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
499 (dst.file == arg1.file && dst.nr == arg1.nr)) {
500 struct brw_compile *p = &c->func;
501 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
502 func(c, tmp, arg0, arg1);
503 brw_MOV(p, dst, tmp);
504 release_tmp(c, tmp);
505 }
506 else {
507 func(c, dst, arg0, arg1);
508 }
509 }
510
511 /**
512 * \sa unalias2
513  * Checks if a 3-operand instruction needs an intermediate temporary.
514 */
515 static void unalias3( struct brw_vs_compile *c,
516 struct brw_reg dst,
517 struct brw_reg arg0,
518 struct brw_reg arg1,
519 struct brw_reg arg2,
520 void (*func)( struct brw_vs_compile *,
521 struct brw_reg,
522 struct brw_reg,
523 struct brw_reg,
524 struct brw_reg ))
525 {
526 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
527 (dst.file == arg1.file && dst.nr == arg1.nr) ||
528 (dst.file == arg2.file && dst.nr == arg2.nr)) {
529 struct brw_compile *p = &c->func;
530 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
531 func(c, tmp, arg0, arg1, arg2);
532 brw_MOV(p, dst, tmp);
533 release_tmp(c, tmp);
534 }
535 else {
536 func(c, dst, arg0, arg1, arg2);
537 }
538 }
539
540 static void emit_sop( struct brw_vs_compile *c,
541 struct brw_reg dst,
542 struct brw_reg arg0,
543 struct brw_reg arg1,
544 GLuint cond)
545 {
546 struct brw_compile *p = &c->func;
547
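   /* brw_CMP() with a null dest leaves following instructions predicated
    * on the comparison result, so the second MOV writes 1.0 only to the
    * channels that passed; brw_set_predicate_control_flag_value() then
    * clears the predication state again.
    */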
548 brw_MOV(p, dst, brw_imm_f(0.0f));
549 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
550 brw_MOV(p, dst, brw_imm_f(1.0f));
551 brw_set_predicate_control_flag_value(p, 0xff);
552 }
553
554 static void emit_seq( struct brw_vs_compile *c,
555 struct brw_reg dst,
556 struct brw_reg arg0,
557 struct brw_reg arg1 )
558 {
559 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
560 }
561
562 static void emit_sne( struct brw_vs_compile *c,
563 struct brw_reg dst,
564 struct brw_reg arg0,
565 struct brw_reg arg1 )
566 {
567 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
568 }
569 static void emit_slt( struct brw_vs_compile *c,
570 struct brw_reg dst,
571 struct brw_reg arg0,
572 struct brw_reg arg1 )
573 {
574 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
575 }
576
577 static void emit_sle( struct brw_vs_compile *c,
578 struct brw_reg dst,
579 struct brw_reg arg0,
580 struct brw_reg arg1 )
581 {
582 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
583 }
584
585 static void emit_sgt( struct brw_vs_compile *c,
586 struct brw_reg dst,
587 struct brw_reg arg0,
588 struct brw_reg arg1 )
589 {
590 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
591 }
592
593 static void emit_sge( struct brw_vs_compile *c,
594 struct brw_reg dst,
595 struct brw_reg arg0,
596 struct brw_reg arg1 )
597 {
598 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
599 }
600
601 static void emit_cmp( struct brw_compile *p,
602 struct brw_reg dst,
603 struct brw_reg arg0,
604 struct brw_reg arg1,
605 struct brw_reg arg2 )
606 {
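   /* dst = (arg0 < 0.0) ? arg1 : arg2 -- the CMP predicates the SEL per
    * channel, and predication is reset afterwards.
    */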
607 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
608 brw_SEL(p, dst, arg1, arg2);
609 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
610 }
611
612 static void emit_sign(struct brw_vs_compile *c,
613 struct brw_reg dst,
614 struct brw_reg arg0)
615 {
616 struct brw_compile *p = &c->func;
617
618 brw_MOV(p, dst, brw_imm_f(0));
619
620 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
621 brw_MOV(p, dst, brw_imm_f(-1.0));
622 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
623
624 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
625 brw_MOV(p, dst, brw_imm_f(1.0));
626 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
627 }
628
629 static void emit_max( struct brw_compile *p,
630 struct brw_reg dst,
631 struct brw_reg arg0,
632 struct brw_reg arg1 )
633 {
634 struct intel_context *intel = &p->brw->intel;
635
636 if (intel->gen >= 6) {
637 brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
638 brw_SEL(p, dst, arg0, arg1);
639 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
640 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
641 } else {
642 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
643 brw_SEL(p, dst, arg0, arg1);
644 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
645 }
646 }
647
648 static void emit_min( struct brw_compile *p,
649 struct brw_reg dst,
650 struct brw_reg arg0,
651 struct brw_reg arg1 )
652 {
653 struct intel_context *intel = &p->brw->intel;
654
655 if (intel->gen >= 6) {
656 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
657 brw_SEL(p, dst, arg0, arg1);
658 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
659 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
660 } else {
661 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
662 brw_SEL(p, dst, arg0, arg1);
663 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
664 }
665 }
666
667 static void emit_arl(struct brw_compile *p,
668 struct brw_reg dst,
669 struct brw_reg src)
670 {
671 struct intel_context *intel = &p->brw->intel;
672
673 if (intel->gen >= 6) {
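      /* Round into a float-typed view of dst first and let the MOV do the
       * float-to-int conversion; presumably gen6 can't combine RNDD with
       * the conversion directly.
       */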
674 struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
675
676 brw_RNDD(p, dst_f, src);
677 brw_MOV(p, dst, dst_f);
678 } else {
679 brw_RNDD(p, dst, src);
680 }
681 }
682
683 static void emit_math1_gen4(struct brw_vs_compile *c,
684 GLuint function,
685 struct brw_reg dst,
686 struct brw_reg arg0,
687 GLuint precision)
688 {
689 /* There are various odd behaviours with SEND on the simulator. In
690 * addition there are documented issues with the fact that the GEN4
691 * processor doesn't do dependency control properly on SEND
692 * results. So, on balance, this kludge to get around failures
693 * with writemasked math results looks like it might be necessary
694 * whether that turns out to be a simulator bug or not:
695 */
696 struct brw_compile *p = &c->func;
697 struct brw_reg tmp = dst;
698 GLboolean need_tmp = GL_FALSE;
699
700 if (dst.file != BRW_GENERAL_REGISTER_FILE ||
701 dst.dw1.bits.writemask != 0xf)
702 need_tmp = GL_TRUE;
703
704 if (need_tmp)
705 tmp = get_tmp(c);
706
707 brw_math(p,
708 tmp,
709 function,
710 BRW_MATH_SATURATE_NONE,
711 2,
712 arg0,
713 BRW_MATH_DATA_SCALAR,
714 precision);
715
716 if (need_tmp) {
717 brw_MOV(p, dst, tmp);
718 release_tmp(c, tmp);
719 }
720 }
721
722 static void
723 emit_math1_gen6(struct brw_vs_compile *c,
724 GLuint function,
725 struct brw_reg dst,
726 struct brw_reg arg0,
727 GLuint precision)
728 {
729 struct brw_compile *p = &c->func;
730 struct brw_reg tmp_src, tmp_dst;
731
732 /* Something is strange on gen6 math in 16-wide mode, though the
733 * docs say it's supposed to work. Punt to using align1 mode,
734 * which doesn't do writemasking and swizzles.
735 */
736 tmp_src = get_tmp(c);
737 tmp_dst = get_tmp(c);
738
739 brw_MOV(p, tmp_src, arg0);
740
741 brw_set_access_mode(p, BRW_ALIGN_1);
742 brw_math(p,
743 tmp_dst,
744 function,
745 BRW_MATH_SATURATE_NONE,
746 2,
747 tmp_src,
748 BRW_MATH_DATA_SCALAR,
749 precision);
750 brw_set_access_mode(p, BRW_ALIGN_16);
751
752 brw_MOV(p, dst, tmp_dst);
753
754 release_tmp(c, tmp_src);
755 release_tmp(c, tmp_dst);
756 }
757
758 static void
759 emit_math1(struct brw_vs_compile *c,
760 GLuint function,
761 struct brw_reg dst,
762 struct brw_reg arg0,
763 GLuint precision)
764 {
765 struct brw_compile *p = &c->func;
766 struct intel_context *intel = &p->brw->intel;
767
768 if (intel->gen >= 6)
769 emit_math1_gen6(c, function, dst, arg0, precision);
770 else
771 emit_math1_gen4(c, function, dst, arg0, precision);
772 }
773
774 static void emit_math2_gen4( struct brw_vs_compile *c,
775 GLuint function,
776 struct brw_reg dst,
777 struct brw_reg arg0,
778 struct brw_reg arg1,
779 GLuint precision)
780 {
781 struct brw_compile *p = &c->func;
782 struct brw_reg tmp = dst;
783 GLboolean need_tmp = GL_FALSE;
784
785 if (dst.file != BRW_GENERAL_REGISTER_FILE ||
786 dst.dw1.bits.writemask != 0xf)
787 need_tmp = GL_TRUE;
788
789 if (need_tmp)
790 tmp = get_tmp(c);
791
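   /* The two-operand math message sends arg1 in the next message reg (m3);
    * brw_math() below sends arg0 starting at m2 (the literal 2 is the
    * message reg number).
    */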
792 brw_MOV(p, brw_message_reg(3), arg1);
793
794 brw_math(p,
795 tmp,
796 function,
797 BRW_MATH_SATURATE_NONE,
798 2,
799 arg0,
800 BRW_MATH_DATA_SCALAR,
801 precision);
802
803 if (need_tmp) {
804 brw_MOV(p, dst, tmp);
805 release_tmp(c, tmp);
806 }
807 }
808
809 static void emit_math2_gen6( struct brw_vs_compile *c,
810 GLuint function,
811 struct brw_reg dst,
812 struct brw_reg arg0,
813 struct brw_reg arg1,
814 GLuint precision)
815 {
816 struct brw_compile *p = &c->func;
817 struct brw_reg tmp_src0, tmp_src1, tmp_dst;
818
819 tmp_src0 = get_tmp(c);
820 tmp_src1 = get_tmp(c);
821 tmp_dst = get_tmp(c);
822
823 brw_MOV(p, tmp_src0, arg0);
824 brw_MOV(p, tmp_src1, arg1);
825
826 brw_set_access_mode(p, BRW_ALIGN_1);
827 brw_math2(p,
828 tmp_dst,
829 function,
830 tmp_src0,
831 tmp_src1);
832 brw_set_access_mode(p, BRW_ALIGN_16);
833
834 brw_MOV(p, dst, tmp_dst);
835
836 release_tmp(c, tmp_src0);
837 release_tmp(c, tmp_src1);
838 release_tmp(c, tmp_dst);
839 }
840
841 static void emit_math2( struct brw_vs_compile *c,
842 GLuint function,
843 struct brw_reg dst,
844 struct brw_reg arg0,
845 struct brw_reg arg1,
846 GLuint precision)
847 {
848 struct brw_compile *p = &c->func;
849 struct intel_context *intel = &p->brw->intel;
850
851 if (intel->gen >= 6)
852 emit_math2_gen6(c, function, dst, arg0, arg1, precision);
853 else
854 emit_math2_gen4(c, function, dst, arg0, arg1, precision);
855 }
856
857 static void emit_exp_noalias( struct brw_vs_compile *c,
858 struct brw_reg dst,
859 struct brw_reg arg0 )
860 {
861 struct brw_compile *p = &c->func;
862
863
864 if (dst.dw1.bits.writemask & WRITEMASK_X) {
865 struct brw_reg tmp = get_tmp(c);
866 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
867
868 /* tmp_d = floor(arg0.x) */
869 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
870
871 /* result[0] = 2.0 ^ tmp */
872
873 /* Adjust exponent for floating point:
874 * exp += 127
875 */
876 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
877
878 /* Install exponent and sign.
879 * Excess drops off the edge:
880 */
881 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
882 tmp_d, brw_imm_d(23));
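      /* e.g. arg0.x = 3.2: tmp_d = 3, and (3 + 127) << 23 is exactly the
       * IEEE-754 single-precision bit pattern of 2^3 = 8.0.
       */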
883
884 release_tmp(c, tmp);
885 }
886
887 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
888 /* result[1] = arg0.x - floor(arg0.x) */
889 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
890 }
891
892 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
893 /* As with the LOG instruction, we might be better off just
894 * doing a taylor expansion here, seeing as we have to do all
895 * the prep work.
896 *
897 * If mathbox partial precision is too low, consider also:
898 * result[3] = result[0] * EXP(result[1])
899 */
900 emit_math1(c,
901 BRW_MATH_FUNCTION_EXP,
902 brw_writemask(dst, WRITEMASK_Z),
903 brw_swizzle1(arg0, 0),
904 BRW_MATH_PRECISION_FULL);
905 }
906
907 if (dst.dw1.bits.writemask & WRITEMASK_W) {
908 /* result[3] = 1.0; */
909 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
910 }
911 }
912
913
914 static void emit_log_noalias( struct brw_vs_compile *c,
915 struct brw_reg dst,
916 struct brw_reg arg0 )
917 {
918 struct brw_compile *p = &c->func;
919 struct brw_reg tmp = dst;
920 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
921 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
922 GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
923 dst.file != BRW_GENERAL_REGISTER_FILE);
924
925 if (need_tmp) {
926 tmp = get_tmp(c);
927 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
928 }
929
930    /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
931     * according to spec:
932     *
933     * These almost look like they could be joined up, but not really
934     * practical:
935     *
936     * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
937     * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
938 */
939 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
940 brw_AND(p,
941 brw_writemask(tmp_ud, WRITEMASK_X),
942 brw_swizzle1(arg0_ud, 0),
943 brw_imm_ud((1U<<31)-1));
944
945 brw_SHR(p,
946 brw_writemask(tmp_ud, WRITEMASK_X),
947 tmp_ud,
948 brw_imm_ud(23));
949
950 brw_ADD(p,
951 brw_writemask(tmp, WRITEMASK_X),
952 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
953 brw_imm_d(-127));
954 }
955
956 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
957 brw_AND(p,
958 brw_writemask(tmp_ud, WRITEMASK_Y),
959 brw_swizzle1(arg0_ud, 0),
960 brw_imm_ud((1<<23)-1));
961
962 brw_OR(p,
963 brw_writemask(tmp_ud, WRITEMASK_Y),
964 tmp_ud,
965 brw_imm_ud(127<<23));
966 }
967
968 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
969 /* result[2] = result[0] + LOG2(result[1]); */
970
971 /* Why bother? The above is just a hint how to do this with a
972 * taylor series. Maybe we *should* use a taylor series as by
973 * the time all the above has been done it's almost certainly
974 * quicker than calling the mathbox, even with low precision.
975 *
976 * Options are:
977 * - result[0] + mathbox.LOG2(result[1])
978 * - mathbox.LOG2(arg0.x)
979 * - result[0] + inline_taylor_approx(result[1])
980 */
981 emit_math1(c,
982 BRW_MATH_FUNCTION_LOG,
983 brw_writemask(tmp, WRITEMASK_Z),
984 brw_swizzle1(tmp, 1),
985 BRW_MATH_PRECISION_FULL);
986
987 brw_ADD(p,
988 brw_writemask(tmp, WRITEMASK_Z),
989 brw_swizzle1(tmp, 2),
990 brw_swizzle1(tmp, 0));
991 }
992
993 if (dst.dw1.bits.writemask & WRITEMASK_W) {
994 /* result[3] = 1.0; */
995 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
996 }
997
998 if (need_tmp) {
999 brw_MOV(p, dst, tmp);
1000 release_tmp(c, tmp);
1001 }
1002 }
1003
1004
1005 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
1006 */
1007 static void emit_dst_noalias( struct brw_vs_compile *c,
1008 struct brw_reg dst,
1009 struct brw_reg arg0,
1010 struct brw_reg arg1)
1011 {
1012 struct brw_compile *p = &c->func;
1013
1014 /* There must be a better way to do this:
1015 */
1016 if (dst.dw1.bits.writemask & WRITEMASK_X)
1017 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
1018 if (dst.dw1.bits.writemask & WRITEMASK_Y)
1019 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
1020 if (dst.dw1.bits.writemask & WRITEMASK_Z)
1021 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
1022 if (dst.dw1.bits.writemask & WRITEMASK_W)
1023 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
1024 }
1025
1026
1027 static void emit_xpd( struct brw_compile *p,
1028 struct brw_reg dst,
1029 struct brw_reg t,
1030 struct brw_reg u)
1031 {
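   /* Cross product via the accumulator: acc = t.yzx * u.zxy, then
    * dst = acc - t.zxy * u.yzx via the MAC with a negated source.
    */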
1032 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
1033 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
1034 }
1035
1036
1037 static void emit_lit_noalias( struct brw_vs_compile *c,
1038 struct brw_reg dst,
1039 struct brw_reg arg0 )
1040 {
1041 struct brw_compile *p = &c->func;
1042 struct brw_instruction *if_insn;
1043 struct brw_reg tmp = dst;
1044 GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1045
1046 if (need_tmp)
1047 tmp = get_tmp(c);
1048
1049 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
1050 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
1051
1052 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1053 * to get all channels active inside the IF. In the clipping code
1054 * we run with NoMask, so it's not an option and we can use
1055  * BRW_EXECUTE_1 for all comparisons.
1056 */
1057 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
1058 if_insn = brw_IF(p, BRW_EXECUTE_8);
1059 {
1060 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
1061
1062 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
1063 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
1064 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1065
1066 emit_math2(c,
1067 BRW_MATH_FUNCTION_POW,
1068 brw_writemask(dst, WRITEMASK_Z),
1069 brw_swizzle1(tmp, 2),
1070 brw_swizzle1(arg0, 3),
1071 BRW_MATH_PRECISION_PARTIAL);
1072 }
1073
1074 brw_ENDIF(p, if_insn);
1075
1076 release_tmp(c, tmp);
1077 }
1078
1079 static void emit_lrp_noalias(struct brw_vs_compile *c,
1080 struct brw_reg dst,
1081 struct brw_reg arg0,
1082 struct brw_reg arg1,
1083 struct brw_reg arg2)
1084 {
1085 struct brw_compile *p = &c->func;
1086
1087 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
1088 brw_MUL(p, brw_null_reg(), dst, arg2);
1089 brw_MAC(p, dst, arg0, arg1);
1090 }
1091
1092 /** 3 or 4-component vector normalization */
1093 static void emit_nrm( struct brw_vs_compile *c,
1094 struct brw_reg dst,
1095 struct brw_reg arg0,
1096 int num_comps)
1097 {
1098 struct brw_compile *p = &c->func;
1099 struct brw_reg tmp = get_tmp(c);
1100
1101 /* tmp = dot(arg0, arg0) */
1102 if (num_comps == 3)
1103 brw_DP3(p, tmp, arg0, arg0);
1104 else
1105 brw_DP4(p, tmp, arg0, arg0);
1106
1107 /* tmp = 1 / sqrt(tmp) */
1108 emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
1109
1110 /* dst = arg0 * tmp */
1111 brw_MUL(p, dst, arg0, tmp);
1112
1113 release_tmp(c, tmp);
1114 }
1115
1116
1117 static struct brw_reg
1118 get_constant(struct brw_vs_compile *c,
1119 const struct prog_instruction *inst,
1120 GLuint argIndex)
1121 {
1122 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1123 struct brw_compile *p = &c->func;
1124 struct brw_reg const_reg = c->current_const[argIndex].reg;
1125
1126 assert(argIndex < 3);
1127
1128 if (c->current_const[argIndex].index != src->Index) {
1129 /* Keep track of the last constant loaded in this slot, for reuse. */
1130 c->current_const[argIndex].index = src->Index;
1131
1132 #if 0
1133 printf(" fetch const[%d] for arg %d into reg %d\n",
1134 src->Index, argIndex, c->current_const[argIndex].reg.nr);
1135 #endif
1136 /* need to fetch the constant now */
1137 brw_dp_READ_4_vs(p,
1138 const_reg, /* writeback dest */
1139 16 * src->Index, /* byte offset */
1140 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
1141 );
1142 }
1143
1144 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1145 const_reg = stride(const_reg, 0, 4, 1);
1146 const_reg.subnr = 0;
1147
1148 return const_reg;
1149 }
1150
1151 static struct brw_reg
1152 get_reladdr_constant(struct brw_vs_compile *c,
1153 const struct prog_instruction *inst,
1154 GLuint argIndex)
1155 {
1156 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1157 struct brw_compile *p = &c->func;
1158 struct brw_context *brw = p->brw;
1159 struct intel_context *intel = &brw->intel;
1160 struct brw_reg const_reg = c->current_const[argIndex].reg;
1161 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1162 uint32_t offset;
1163
1164 assert(argIndex < 3);
1165
1166 /* Can't reuse a reladdr constant load. */
1167 c->current_const[argIndex].index = -1;
1168
1169 #if 0
1170 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1171 src->Index, argIndex, c->current_const[argIndex].reg.nr);
1172 #endif
1173
1174 if (intel->gen >= 6) {
1175 offset = src->Index;
1176 } else {
1177 struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
1178 brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
1179 addr_reg = byte_addr_reg;
1180 offset = 16 * src->Index;
1181 }
1182
1183 /* fetch the first vec4 */
1184 brw_dp_READ_4_vs_relative(p,
1185 const_reg,
1186 addr_reg,
1187 offset,
1188 SURF_INDEX_VERT_CONST_BUFFER);
1189
1190 return const_reg;
1191 }
1192
1193
1194
1195 /* TODO: relative addressing!
1196 */
1197 static struct brw_reg get_reg( struct brw_vs_compile *c,
1198 gl_register_file file,
1199 GLuint index )
1200 {
1201 switch (file) {
1202 case PROGRAM_TEMPORARY:
1203 case PROGRAM_INPUT:
1204 case PROGRAM_OUTPUT:
1205 assert(c->regs[file][index].nr != 0);
1206 return c->regs[file][index];
1207 case PROGRAM_STATE_VAR:
1208 case PROGRAM_CONSTANT:
1209 case PROGRAM_UNIFORM:
1210 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1211 return c->regs[PROGRAM_STATE_VAR][index];
1212 case PROGRAM_ADDRESS:
1213 assert(index == 0);
1214 return c->regs[file][index];
1215
1216 case PROGRAM_UNDEFINED: /* undef values */
1217 return brw_null_reg();
1218
1219 case PROGRAM_LOCAL_PARAM:
1220 case PROGRAM_ENV_PARAM:
1221 case PROGRAM_WRITE_ONLY:
1222 default:
1223 assert(0);
1224 return brw_null_reg();
1225 }
1226 }
1227
1228
1229 /**
1230 * Indirect addressing: get reg[[arg] + offset].
1231 */
1232 static struct brw_reg deref( struct brw_vs_compile *c,
1233 struct brw_reg arg,
1234 GLint offset,
1235 GLuint reg_size )
1236 {
1237 struct brw_compile *p = &c->func;
1238 struct brw_reg tmp = get_tmp(c);
1239 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1240 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1241 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
1242 struct brw_reg indirect = brw_vec4_indirect(0,0);
1243 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1244
1245 /* Set the vertical stride on the register access so that the first
1246 * 4 components come from a0.0 and the second 4 from a0.1.
1247 */
1248 indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1249
1250 {
1251 brw_push_insn_state(p);
1252 brw_set_access_mode(p, BRW_ALIGN_1);
1253
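      /* Compute per-vertex byte addresses: a0.0 gets vertex 0's address,
       * a0.1 (fed from the second address value at suboffset 4) gets
       * vertex 1's.
       */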
1254 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1255 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1256
1257 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1258 brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1259
1260 brw_MOV(p, tmp, indirect);
1261
1262 brw_pop_insn_state(p);
1263 }
1264
1265 /* NOTE: tmp not released */
1266 return tmp;
1267 }
1268
1269 static void
1270 move_to_reladdr_dst(struct brw_vs_compile *c,
1271 const struct prog_instruction *inst,
1272 struct brw_reg val)
1273 {
1274 struct brw_compile *p = &c->func;
1275 int reg_size = 32;
1276 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1277 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1278 struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
1279 GLuint byte_offset = base.nr * 32 + base.subnr;
1280 struct brw_reg indirect = brw_vec4_indirect(0,0);
1281 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1282
1283 /* Because destination register indirect addressing can only use
1284 * one index, we'll write each vertex's vec4 value separately.
1285 */
1286 val.width = BRW_WIDTH_4;
1287 val.vstride = BRW_VERTICAL_STRIDE_4;
1288
1289 brw_push_insn_state(p);
1290 brw_set_access_mode(p, BRW_ALIGN_1);
1291
1292 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1293 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1294 brw_MOV(p, indirect, val);
1295
1296 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1297 brw_ADD(p, brw_address_reg(0), acc,
1298 brw_imm_uw(byte_offset + reg_size / 2));
1299 brw_MOV(p, indirect, suboffset(val, 4));
1300
1301 brw_pop_insn_state(p);
1302 }
1303
1304 /**
1305 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1306 * TODO: relative addressing!
1307 */
1308 static struct brw_reg
1309 get_src_reg( struct brw_vs_compile *c,
1310 const struct prog_instruction *inst,
1311 GLuint argIndex )
1312 {
1313 const GLuint file = inst->SrcReg[argIndex].File;
1314 const GLint index = inst->SrcReg[argIndex].Index;
1315 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
1316
1317 if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1318 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1319
1320 if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1321 SWIZZLE_ZERO,
1322 SWIZZLE_ZERO,
1323 SWIZZLE_ZERO)) {
1324 return brw_imm_f(0.0f);
1325 } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1326 SWIZZLE_ONE,
1327 SWIZZLE_ONE,
1328 SWIZZLE_ONE)) {
1329 if (src->Negate)
1330 return brw_imm_f(-1.0F);
1331 else
1332 return brw_imm_f(1.0F);
1333 } else if (src->File == PROGRAM_CONSTANT) {
1334 const struct gl_program_parameter_list *params;
1335 float f;
1336 int component = -1;
1337
1338 switch (src->Swizzle) {
1339 case SWIZZLE_XXXX:
1340 component = 0;
1341 break;
1342 case SWIZZLE_YYYY:
1343 component = 1;
1344 break;
1345 case SWIZZLE_ZZZZ:
1346 component = 2;
1347 break;
1348 case SWIZZLE_WWWW:
1349 component = 3;
1350 break;
1351 }
1352
1353 if (component >= 0) {
1354 params = c->vp->program.Base.Parameters;
1355 f = params->ParameterValues[src->Index][component];
1356
1357 if (src->Abs)
1358 f = fabs(f);
1359 if (src->Negate)
1360 f = -f;
1361 return brw_imm_f(f);
1362 }
1363 }
1364 }
1365
1366 switch (file) {
1367 case PROGRAM_TEMPORARY:
1368 case PROGRAM_INPUT:
1369 case PROGRAM_OUTPUT:
1370 if (relAddr) {
1371 return deref(c, c->regs[file][0], index, 32);
1372 }
1373 else {
1374 assert(c->regs[file][index].nr != 0);
1375 return c->regs[file][index];
1376 }
1377
1378 case PROGRAM_STATE_VAR:
1379 case PROGRAM_CONSTANT:
1380 case PROGRAM_UNIFORM:
1381 case PROGRAM_ENV_PARAM:
1382 case PROGRAM_LOCAL_PARAM:
1383 if (!relAddr && c->constant_map[index] != -1) {
1384 /* Take from the push constant buffer if possible. */
1385 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1386 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1387 } else {
1388       /* Must be in the pull constant buffer, then. */
1389 assert(c->vp->use_const_buffer);
1390 if (relAddr)
1391 return get_reladdr_constant(c, inst, argIndex);
1392 else
1393 return get_constant(c, inst, argIndex);
1394 }
1395 case PROGRAM_ADDRESS:
1396 assert(index == 0);
1397 return c->regs[file][index];
1398
1399 case PROGRAM_UNDEFINED:
1400 /* this is a normal case since we loop over all three src args */
1401 return brw_null_reg();
1402
1403 case PROGRAM_WRITE_ONLY:
1404 default:
1405 assert(0);
1406 return brw_null_reg();
1407 }
1408 }
1409
1410 /**
1411 * Return the brw reg for the given instruction's src argument.
1412 * Will return mangled results for SWZ op. The emit_swz() function
1413 * ignores this result and recalculates taking extended swizzles into
1414 * account.
1415 */
1416 static struct brw_reg get_arg( struct brw_vs_compile *c,
1417 const struct prog_instruction *inst,
1418 GLuint argIndex )
1419 {
1420 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1421 struct brw_reg reg;
1422
1423 if (src->File == PROGRAM_UNDEFINED)
1424 return brw_null_reg();
1425
1426 reg = get_src_reg(c, inst, argIndex);
1427
1428 /* Convert 3-bit swizzle to 2-bit.
1429 */
1430 if (reg.file != BRW_IMMEDIATE_VALUE) {
1431 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1432 GET_SWZ(src->Swizzle, 1),
1433 GET_SWZ(src->Swizzle, 2),
1434 GET_SWZ(src->Swizzle, 3));
1435
1436 /* Note this is ok for non-swizzle ARB_vp instructions */
1437 reg.negate = src->Negate ? 1 : 0;
1438 }
1439
1440 return reg;
1441 }
1442
1443
1444 /**
1445 * Get brw register for the given program dest register.
1446 */
1447 static struct brw_reg get_dst( struct brw_vs_compile *c,
1448 struct prog_dst_register dst )
1449 {
1450 struct brw_reg reg;
1451
1452 switch (dst.File) {
1453 case PROGRAM_TEMPORARY:
1454 case PROGRAM_OUTPUT:
1455 /* register-indirect addressing is only 1x1, not VxH, for
1456 * destination regs. So, for RelAddr we'll return a temporary
1457 * for the dest and do a move of the result to the RelAddr
1458 * register after the instruction emit.
1459 */
1460 if (dst.RelAddr) {
1461 reg = get_tmp(c);
1462 } else {
1463 assert(c->regs[dst.File][dst.Index].nr != 0);
1464 reg = c->regs[dst.File][dst.Index];
1465 }
1466 break;
1467 case PROGRAM_ADDRESS:
1468 assert(dst.Index == 0);
1469 reg = c->regs[dst.File][dst.Index];
1470 break;
1471 case PROGRAM_UNDEFINED:
1472 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1473 reg = brw_null_reg();
1474 break;
1475 default:
1476 assert(0);
1477 reg = brw_null_reg();
1478 }
1479
1480 assert(reg.type != BRW_IMMEDIATE_VALUE);
1481 reg.dw1.bits.writemask = dst.WriteMask;
1482
1483 return reg;
1484 }
1485
1486
1487 static void emit_swz( struct brw_vs_compile *c,
1488 struct brw_reg dst,
1489 const struct prog_instruction *inst)
1490 {
1491 const GLuint argIndex = 0;
1492 const struct prog_src_register src = inst->SrcReg[argIndex];
1493 struct brw_compile *p = &c->func;
1494 GLuint zeros_mask = 0;
1495 GLuint ones_mask = 0;
1496 GLuint src_mask = 0;
1497 GLubyte src_swz[4];
1498 GLboolean need_tmp = (src.Negate &&
1499 dst.file != BRW_GENERAL_REGISTER_FILE);
1500 struct brw_reg tmp = dst;
1501 GLuint i;
1502
1503 if (need_tmp)
1504 tmp = get_tmp(c);
1505
1506 for (i = 0; i < 4; i++) {
1507 if (dst.dw1.bits.writemask & (1<<i)) {
1508 GLubyte s = GET_SWZ(src.Swizzle, i);
1509 switch (s) {
1510 case SWIZZLE_X:
1511 case SWIZZLE_Y:
1512 case SWIZZLE_Z:
1513 case SWIZZLE_W:
1514 src_mask |= 1<<i;
1515 src_swz[i] = s;
1516 break;
1517 case SWIZZLE_ZERO:
1518 zeros_mask |= 1<<i;
1519 break;
1520 case SWIZZLE_ONE:
1521 ones_mask |= 1<<i;
1522 break;
1523 }
1524 }
1525 }
1526
1527 /* Do src first, in case dst aliases src:
1528 */
1529 if (src_mask) {
1530 struct brw_reg arg0;
1531
1532 arg0 = get_src_reg(c, inst, argIndex);
1533
1534 arg0 = brw_swizzle(arg0,
1535 src_swz[0], src_swz[1],
1536 src_swz[2], src_swz[3]);
1537
1538 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1539 }
1540
1541 if (zeros_mask)
1542 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1543
1544 if (ones_mask)
1545 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1546
1547 if (src.Negate)
1548 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1549
1550 if (need_tmp) {
1551 brw_MOV(p, dst, tmp);
1552 release_tmp(c, tmp);
1553 }
1554 }
1555
1556 static int
1557 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1558 {
1559 struct intel_context *intel = &brw->intel;
1560
1561 if (intel->gen >= 6) {
1562 /* URB data written (does not include the message header reg) must
1563 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1564 * section 5.4.3.2.2: URB_INTERLEAVED.
1565 *
1566 * URB entries are allocated on a multiple of 1024 bits, so an
1567 * extra 128 bits written here to make the end align to 256 is
1568 * no problem.
1569 */
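      /* mlen includes the message header reg, so the URB data written is
       * mlen - 1 regs; bumping an even mlen to odd keeps that data length
       * even.
       */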
1570 if ((mlen % 2) != 1)
1571 mlen++;
1572 }
1573
1574 return mlen;
1575 }
1576
1577 /**
1578 * Post-vertex-program processing. Send the results to the URB.
1579 */
1580 static void emit_vertex_write( struct brw_vs_compile *c)
1581 {
1582 struct brw_compile *p = &c->func;
1583 struct brw_context *brw = p->brw;
1584 struct intel_context *intel = &brw->intel;
1585 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1586 struct brw_reg ndc;
1587 int eot;
1588 GLuint len_vertex_header = 2;
1589 int next_mrf, i;
1590 int msg_len;
1591
1592 if (c->key.copy_edgeflag) {
1593 brw_MOV(p,
1594 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1595 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1596 }
1597
1598 if (intel->gen < 6) {
1599 /* Build ndc coords */
1600 ndc = get_tmp(c);
1601 /* ndc = 1.0 / pos.w */
1602 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1603 /* ndc.xyz = pos * ndc */
1604 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1605 }
1606
1607 /* Update the header for point size, user clipping flags, and -ve rhw
1608 * workaround.
1609 */
1610 if (intel->gen >= 6) {
1611 struct brw_reg m1 = brw_message_reg(1);
1612
1613 /* On gen6, m1 has each value in a separate dword, so we never
1614 * need to mess with a temporary for computing the m1 value.
1615 */
1616 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1617 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1618 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
1619 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
1620 }
1621
1622 /* Set the user clip distances in dword 8-15. (m3-4)*/
1623 if (c->key.nr_userclip) {
1624 for (i = 0; i < c->key.nr_userclip; i++) {
1625 struct brw_reg m;
1626 if (i < 4)
1627 m = brw_message_reg(3);
1628 else
1629 m = brw_message_reg(4);
1630
1631 	    brw_DP4(p, brw_writemask(m, (1 << (i & 3))), pos, c->userplane[i]);
1632 }
1633 }
1634 } else if ((c->prog_data.outputs_written &
1635 BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1636 c->key.nr_userclip || brw->has_negative_rhw_bug) {
1637 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1638 GLuint i;
1639
1640 brw_MOV(p, header1, brw_imm_ud(0));
1641
1642 brw_set_access_mode(p, BRW_ALIGN_16);
1643
1644 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1645 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
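         /* Scale the float point size and mask it into the 11-bit
          * point-width field at bits 8..18 of the header dword (U8.3
          * fixed point, going by the 1<<11 scale and 0x7ff<<8 mask).
          */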
1646 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
1647 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1648 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
1649 header1, brw_imm_ud(0x7ff<<8));
1650 }
1651
1652 for (i = 0; i < c->key.nr_userclip; i++) {
1653 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1654 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1655 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1656 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1657 }
1658
1659 /* i965 clipping workaround:
1660 * 1) Test for -ve rhw
1661 * 2) If set,
1662 * set ndc = (0,0,0,0)
1663 * set ucp[6] = 1
1664 *
1665 * Later, clipping will detect ucp[6] and ensure the primitive is
1666 * clipped against all fixed planes.
1667 */
1668 if (brw->has_negative_rhw_bug) {
1669 brw_CMP(p,
1670 vec8(brw_null_reg()),
1671 BRW_CONDITIONAL_L,
1672 brw_swizzle1(ndc, 3),
1673 brw_imm_f(0));
1674
1675 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1676 brw_MOV(p, ndc, brw_imm_f(0));
1677 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1678 }
1679
1680 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1681 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1682 brw_set_access_mode(p, BRW_ALIGN_16);
1683
1684 release_tmp(c, header1);
1685 }
1686 else {
1687 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1688 }
1689
1690 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1691 * of zeros followed by two sets of NDC coordinates:
1692 */
1693 brw_set_access_mode(p, BRW_ALIGN_1);
1694 brw_set_acc_write_control(p, 0);
1695
1696 /* The VUE layout is documented in Volume 2a. */
1697 if (intel->gen >= 6) {
1698 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1699 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1700 * dword 4-7 (m2) is the 4D space position
1701 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1702 * enabled.
1703 * m3 or 5 is the first vertex element data we fill, which is
1704 * the vertex position.
1705 */
1706 brw_MOV(p, brw_message_reg(2), pos);
1707 len_vertex_header = 1;
1708 if (c->key.nr_userclip > 0)
1709 len_vertex_header += 2;
1710 } else if (intel->gen == 5) {
1711 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1712 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1713 * dword 4-7 (m2) is the ndc position (set above)
1714 * dword 8-11 (m3) of the vertex header is the 4D space position
1715 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1716 * m6 is a pad so that the vertex element data is aligned
1717 * m7 is the first vertex data we fill, which is the vertex position.
1718 */
1719 brw_MOV(p, brw_message_reg(2), ndc);
1720 brw_MOV(p, brw_message_reg(3), pos);
1721 brw_MOV(p, brw_message_reg(7), pos);
1722 len_vertex_header = 6;
1723 } else {
1724 /* There are 8 dwords in VUE header pre-Ironlake:
1725 * dword 0-3 (m1) is indices, point width, clip flags.
1726 * dword 4-7 (m2) is ndc position (set above)
1727 *
1728 * dword 8-11 (m3) is the first vertex data, which we always have be the
1729 * vertex position.
1730 */
1731 brw_MOV(p, brw_message_reg(2), ndc);
1732 brw_MOV(p, brw_message_reg(3), pos);
1733 len_vertex_header = 2;
1734 }
1735
1736 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1737 next_mrf = 2 + len_vertex_header;
1738 for (i = 0; i < VERT_RESULT_MAX; i++) {
1739 if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
1740 break;
1741 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
1742 continue;
1743 if (i == VERT_RESULT_PSIZ)
1744 continue;
1745
1746 if (i >= VERT_RESULT_TEX0 &&
1747 c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
1748 brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
1749 next_mrf++;
1750 } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
1751 next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
1752 }
1753 }
1754
1755 eot = (c->first_overflow_output == 0);
1756
1757 /* Message header, plus VUE header, plus the (first set of) outputs. */
1758 msg_len = 1 + len_vertex_header + c->nr_outputs;
1759 msg_len = align_interleaved_urb_mlen(brw, msg_len);
1760 /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
1761    msg_len = MIN2(msg_len, (BRW_MAX_MRF - 1));
1762
1763 brw_urb_WRITE(p,
1764 brw_null_reg(), /* dest */
1765 0, /* starting mrf reg nr */
1766 c->r0, /* src */
1767 0, /* allocate */
1768 1, /* used */
1769 msg_len,
1770 0, /* response len */
1771 eot, /* eot */
1772 eot, /* writes complete */
1773 0, /* urb destination offset */
1774 BRW_URB_SWIZZLE_INTERLEAVE);
1775
1776 if (c->first_overflow_output > 0) {
1777 /* Not all of the vertex outputs/results fit into the MRF.
1778 * Move the overflowed attributes from the GRF to the MRF and
1779 * issue another brw_urb_WRITE().
1780 */
1781 GLuint i, mrf = 1;
1782 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1783 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1784 /* move from GRF to MRF */
1785 brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1786 mrf++;
1787 }
1788 }
1789
1790 brw_urb_WRITE(p,
1791 brw_null_reg(), /* dest */
1792 0, /* starting mrf reg nr */
1793 c->r0, /* src */
1794 0, /* allocate */
1795 1, /* used */
1796 align_interleaved_urb_mlen(brw, mrf),
1797 0, /* response len */
1798 1, /* eot */
1799 1, /* writes complete */
1800 14 / 2, /* urb destination offset */
1801 BRW_URB_SWIZZLE_INTERLEAVE);
1802 }
1803 }
1804
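/* Return whether the accumulator already holds `val`.  Accumulator writes
 * are enabled for the whole program (brw_set_acc_write_control() in
 * brw_vs_emit()), so a preceding MOV/MUL/MAC that wrote all four channels
 * of val also left that value in the accumulator.
 */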
1805 static GLboolean
1806 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1807 {
1808    struct brw_compile *p = &c->func;
1809    struct brw_instruction *prev_insn;
1810 
1811    if (p->nr_insn == 0)
1812       return GL_FALSE;
1813    if (val.address_mode != BRW_ADDRESS_DIRECT)
1814       return GL_FALSE;
1815 
1816    prev_insn = &p->store[p->nr_insn - 1];
1817 switch (prev_insn->header.opcode) {
1818 case BRW_OPCODE_MOV:
1819 case BRW_OPCODE_MAC:
1820 case BRW_OPCODE_MUL:
1821 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1822 prev_insn->header.execution_size == val.width &&
1823 prev_insn->bits1.da1.dest_reg_file == val.file &&
1824 prev_insn->bits1.da1.dest_reg_type == val.type &&
1825 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1826 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1827 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1828 prev_insn->bits1.da16.dest_writemask == 0xf)
1829 return GL_TRUE;
1830 else
1831 return GL_FALSE;
1832 default:
1833 return GL_FALSE;
1834 }
1835 }
1836
1837 static uint32_t
1838 get_predicate(const struct prog_instruction *inst)
1839 {
1840 if (inst->DstReg.CondMask == COND_TR)
1841 return BRW_PREDICATE_NONE;
1842
1843 /* All of GLSL only produces predicates for COND_NE and one channel per
1844 * vector. Fail badly if someone starts doing something else, as it might
1845 * mean infinite looping or something.
1846 *
1847 * We'd like to support all the condition codes, but our hardware doesn't
1848 * quite match the Mesa IR, which is modeled after the NV extensions. For
1849 * those, the instruction may update the condition codes or not, then any
1850 * later instruction may use one of those condition codes. For gen4, the
1851 * instruction may update the flags register based on one of the condition
1852 * codes output by the instruction, and then further instructions may
1853 * predicate on that. We can probably support this, but it won't
1854 * necessarily be easy.
1855 */
1856 assert(inst->DstReg.CondMask == COND_NE);
1857
1858 switch (inst->DstReg.CondSwizzle) {
1859 case SWIZZLE_XXXX:
1860 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1861 case SWIZZLE_YYYY:
1862 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1863 case SWIZZLE_ZZZZ:
1864 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1865 case SWIZZLE_WWWW:
1866 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1867 default:
1868       _mesa_problem(NULL, "Unexpected predicate swizzle: 0x%08x\n",
1869                     inst->DstReg.CondSwizzle);
1870 return BRW_PREDICATE_NORMAL;
1871 }
1872 }

/* Emit the vertex program instructions here.
 */
void brw_vs_emit(struct brw_vs_compile *c)
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, if_depth = 0, loop_depth = 0;
   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
                               GL_TRUE);
      printf("\n");
   }

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

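   /* Have every ALU instruction implicitly update the accumulator, so that
    * accumulator_contains() above can tell when the MOV feeding a MAC is
    * redundant.
    */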
   brw_set_acc_write_control(p, 1);

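   /* First pass: note which outputs are also read as sources (they will
    * need a GRF shadow copy) and whether a call/return stack is required.
    */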
   for (insn = 0; insn < nr_insns; insn++) {
      GLuint i;
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

      /* Message registers can't be read, so copy each output into a GRF
       * register if it is later used as a source register.
       */
      for (i = 0; i < 3; i++) {
         struct prog_src_register *src = &inst->SrcReg[i];
         GLuint index = src->Index;
         GLuint file = src->File;
         if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
            c->output_regs[index].used_in_src = GL_TRUE;
      }

      switch (inst->Opcode) {
      case OPCODE_CAL:
      case OPCODE_RET:
         c->needs_stack = GL_TRUE;
         break;
      default:
         break;
      }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

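   /* If the program makes subroutine calls, point the address register at
    * the base of the return-address stack.
    */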
   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
         for (i = 0; i < 3; i++) {
            const struct prog_src_register *src = &inst->SrcReg[i];
            index = src->Index;
            file = src->File;
            if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
               args[i] = c->output_regs[index].reg;
            else
               args[i] = get_arg(c, inst, i);
         }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken when emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
         dst = c->output_regs[index].reg;
      else
         dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
         _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
         args[0].negate = false;
         brw_MOV(p, dst, brw_abs(args[0]));
         break;
      case OPCODE_ADD:
         brw_ADD(p, dst, args[0], args[1]);
         break;
      case OPCODE_COS:
         emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_DP2:
         brw_DP2(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP3:
         brw_DP3(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP4:
         brw_DP4(p, dst, args[0], args[1]);
         break;
      case OPCODE_DPH:
         brw_DPH(p, dst, args[0], args[1]);
         break;
      case OPCODE_NRM3:
         emit_nrm(c, dst, args[0], 3);
         break;
      case OPCODE_NRM4:
         emit_nrm(c, dst, args[0], 4);
         break;
      case OPCODE_DST:
         unalias2(c, dst, args[0], args[1], emit_dst_noalias);
         break;
      case OPCODE_EXP:
         unalias1(c, dst, args[0], emit_exp_noalias);
         break;
      case OPCODE_EX2:
         emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_ARL:
         emit_arl(p, dst, args[0]);
         break;
      case OPCODE_FLR:
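         /* round toward -inf */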
         brw_RNDD(p, dst, args[0]);
         break;
      case OPCODE_FRC:
         brw_FRC(p, dst, args[0]);
         break;
      case OPCODE_LOG:
         unalias1(c, dst, args[0], emit_log_noalias);
         break;
      case OPCODE_LG2:
         emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_LIT:
         unalias1(c, dst, args[0], emit_lit_noalias);
         break;
      case OPCODE_LRP:
         unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
         break;
      case OPCODE_MAD:
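         /* MAD dst, a, b, c is lowered to:
          *    MOV acc, c
          *    MAC dst, a, b        (dst = a * b + acc)
          * The MOV is skipped when the accumulator already holds c, which
          * accumulator_contains() detects thanks to the implicit
          * accumulator writes enabled above.
          */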
         if (!accumulator_contains(c, args[2]))
            brw_MOV(p, brw_acc_reg(), args[2]);
         brw_MAC(p, dst, args[0], args[1]);
         break;
      case OPCODE_CMP:
         emit_cmp(p, dst, args[0], args[1], args[2]);
         break;
      case OPCODE_MAX:
         emit_max(p, dst, args[0], args[1]);
         break;
      case OPCODE_MIN:
         emit_min(p, dst, args[0], args[1]);
         break;
      case OPCODE_MOV:
         brw_MOV(p, dst, args[0]);
         break;
      case OPCODE_MUL:
         brw_MUL(p, dst, args[0], args[1]);
         break;
      case OPCODE_POW:
         emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RCP:
         emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RSQ:
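         /* RSQ is defined on the absolute value of its operand, hence the
          * brw_abs() on the source.
          */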
         emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
         break;

      case OPCODE_SEQ:
         unalias2(c, dst, args[0], args[1], emit_seq);
         break;
      case OPCODE_SIN:
         emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_SNE:
         unalias2(c, dst, args[0], args[1], emit_sne);
         break;
      case OPCODE_SGE:
         unalias2(c, dst, args[0], args[1], emit_sge);
         break;
      case OPCODE_SGT:
         unalias2(c, dst, args[0], args[1], emit_sgt);
         break;
      case OPCODE_SLT:
         unalias2(c, dst, args[0], args[1], emit_slt);
         break;
      case OPCODE_SLE:
         unalias2(c, dst, args[0], args[1], emit_sle);
         break;
      case OPCODE_SSG:
         unalias1(c, dst, args[0], emit_sign);
         break;
      case OPCODE_SUB:
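         /* There is no native SUB; emit ADD with the second source negated. */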
         brw_ADD(p, dst, args[0], negate(args[1]));
         break;
      case OPCODE_SWZ:
         /* The args[0] value can't be used here as it won't have
          * correctly encoded the full swizzle.
          */
         emit_swz(c, dst, inst);
         break;
      case OPCODE_TRUNC:
         /* round toward zero */
         brw_RNDZ(p, dst, args[0]);
         break;
      case OPCODE_XPD:
         emit_xpd(p, dst, args[0], args[1]);
         break;
      case OPCODE_IF:
         assert(if_depth < MAX_IF_DEPTH);
         if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
         /* Note that brw_IF smashes the predicate_control field. */
         if_inst[if_depth]->header.predicate_control = get_predicate(inst);
         if_depth_in_loop[loop_depth]++;
         if_depth++;
         break;
      case OPCODE_ELSE:
         clear_current_const(c);
         assert(if_depth > 0);
         if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
         break;
      case OPCODE_ENDIF:
         clear_current_const(c);
         assert(if_depth > 0);
         brw_ENDIF(p, if_inst[--if_depth]);
         if_depth_in_loop[loop_depth]--;
         break;
      case OPCODE_BGNLOOP:
         clear_current_const(c);
         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
         if_depth_in_loop[loop_depth] = 0;
         break;
      case OPCODE_BRK:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_BREAK(p, if_depth_in_loop[loop_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
         brw_set_predicate_control(p, get_predicate(inst));
         if (intel->gen >= 6) {
            gen6_CONT(p, loop_inst[loop_depth - 1]);
         } else {
            brw_CONT(p, if_depth_in_loop[loop_depth]);
         }
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case OPCODE_ENDLOOP: {
         struct brw_instruction *inst0, *inst1;
         GLuint br = 1;

         clear_current_const(c);
         loop_depth--;

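         /* Double the patched jump counts on Ironlake, which presumably
          * counts jump distances in units of half an instruction (64 bits)
          * rather than the whole 128-bit instructions gen4 uses.
          */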
         if (intel->gen == 5)
            br = 2;

         inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);

         if (intel->gen < 6) {
            /* Patch all the BREAK/CONT instructions from the last BGNLOOP. */
            while (inst0 > loop_inst[loop_depth]) {
               inst0--;
               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
                   inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
               } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
                          inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
               }
            }
         }
         break;
      }

      case OPCODE_BRA:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CAL:
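         /* Push the return address (three 16-byte instructions ahead, i.e.
          * just past the jump below) onto the stack, bump the stack pointer
          * by one dword, and jump; the jump offset is filled in later by
          * brw_resolve_cals().
          */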
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
         brw_set_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(4));
         brw_save_call(p, inst->Comment, p->nr_insn);
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         break;
      case OPCODE_RET:
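         /* Pop the return address off the stack and jump back to it. */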
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(-4));
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
         brw_set_access_mode(p, BRW_ALIGN_16);
         break;
      case OPCODE_END:
         emit_vertex_write(c);
         break;
      case OPCODE_PRINT:
         /* no-op */
         break;
      case OPCODE_BGNSUB:
         brw_save_label(p, inst->Comment, p->nr_insn);
         break;
      case OPCODE_ENDSUB:
         /* no-op */
         break;
      default:
         _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
                                     _mesa_opcode_string(inst->Opcode) :
                                     "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it were set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
         struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

         assert(hw_insn->header.destreg__conditionalmod == 0);
         hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When the destination register is an output register holding a
       * primary/secondary front/back color, we have to clamp the result
       * to [0,1].  This is done by setting the saturation bit on the
       * last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions.  Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT &&
          c->key.clamp_vertex_color) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      if (inst->DstReg.RelAddr) {
         assert(inst->DstReg.File == PROGRAM_TEMPORARY ||
                inst->DstReg.File == PROGRAM_OUTPUT);
         move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

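   /* Patch the CAL jump offsets recorded above, fill in the UIP/JIP jump
    * fields used by gen6 flow control, and run the peephole optimizer over
    * the finished program.
    */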
   brw_resolve_cals(p);
   brw_set_uip_jip(p);

   brw_optimize(p);

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
         brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}