Merge branch 'glsl-to-tgsi'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
/* Release all scratch registers at once by rewinding the stack-style
 * temporary allocator to its starting point.
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
/**
 * Preallocate GRF registers before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Layout (in allocation order): r0 header, user clip planes, push
 * constants, inputs, outputs (MRFs where possible, GRFs on overflow),
 * program temporaries, address regs, pull-constant staging regs,
 * output-read shadow regs, the loop/call stack, then scratch temps.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf, j;
   int attributes_in_vue;
   int first_reladdr_output;
   int max_constant;
   int constant = 0;
   /* NOTE(review): "reoder" is a typo for "reorder"; name kept as-is. */
   int vert_result_reoder[VERT_RESULT_MAX];
   int bfc = 0;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
         /* Two clip planes (vec4 each) fit per GRF. */
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
         /* Pre-gen6 the planes start 6 vec4s into the curbe region. */
	 for (i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }

   }

   /* Assign some (probably all) of the vertex program constants to
    * the push constant buffer/CURBE.
    *
    * There's an obvious limit to the numer of push constants equal to
    * the number of register available, and that number is smaller
    * than the minimum maximum number of vertex program parameters, so
    * support for pull constants is required if we overflow.
    * Additionally, on gen6 the number of push constants is even
    * lower.
    *
    * When there's relative addressing, we don't know what range of
    * Mesa IR registers can be accessed.  And generally, when relative
    * addressing is used we also have too many constants to load them
    * all as push constants.  So, we'll just support relative
    * addressing out of the pull constant buffers, and try to load as
    * many statically-accessed constants into the push constant buffer
    * as we can.
    */
   if (intel->gen >= 6) {
      /* We can only load 32 regs of push constants. */
      max_constant = 32 * 2 - c->key.nr_userclip;
   } else {
      max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
   }

   /* constant_map maps from ParameterValues[] index to index in the
    * push constant buffer, or -1 if it's only in the pull constant
    * buffer.
    * NOTE(review): this memset writes NumParameters *bytes*; confirm
    * constant_map's element type is one byte wide, otherwise only part
    * of the map is initialized.
    */
   memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
   for (i = 0;
	i < c->vp->program.Base.NumInstructions && constant < max_constant;
	i++) {
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
      int arg;

      for (arg = 0; arg < 3 && constant < max_constant; arg++) {
         /* Skip sources that aren't any flavor of constant/parameter. */
	 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
	     inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
	     inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
	     inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
	     inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
	    continue;
	 }

         /* Relative addressing forces the pull constant path. */
	 if (inst->SrcReg[arg].RelAddr) {
	    c->vp->use_const_buffer = GL_TRUE;
	    continue;
	 }

         /* First static use of this parameter: give it a push slot. */
	 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	    c->constant_map[inst->SrcReg[arg].Index] = constant++;
	 }
      }
   }

   /* If we ran out of push constant space, then we'll also upload all
    * constants through the pull constant buffer so that they can be
    * accessed no matter what.  For relative addressing (the common
    * case) we need them all in place anyway.
    */
   if (constant == max_constant)
      c->vp->use_const_buffer = GL_TRUE;

   /* Two push constants (vec4 each) per GRF. */
   for (i = 0; i < constant; i++) {
      c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
							  (i % 2) * 4),
					     0, 4, 1);
   }
   reg += (constant + 1) / 2;
   c->prog_data.curb_read_length = reg - 1;
   c->prog_data.nr_params = constant * 4;
   /* XXX 0 causes a bug elsewhere... */
   if (intel->gen < 6 && c->prog_data.nr_params == 0)
      c->prog_data.nr_params = 4;

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   /* First MRF available for outputs; earlier MRFs are reserved for the
    * URB write header (gen-dependent).
    */
   if (intel->gen >= 6) {
      mrf = 3;
      if (c->key.nr_userclip)
	 mrf += 2;
   } else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);

   /* Start from the identity mapping of VUE slots. */
   for (i = 0; i < VERT_RESULT_MAX; i++)
      vert_result_reoder[i] = i;

   /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
   if (intel->gen >= 6 && c->key.two_side_color) {
      if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
	  (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
	 assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
	 assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
	 bfc = 2;
      } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
		 (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
	 bfc = 1;

      if (bfc) {
         /* Interleave COLi with BFCi so each front/back pair is adjacent. */
	 for (i = 0; i < bfc; i++) {
	    vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i;
	    vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i;
	 }

         /* Shift the results displaced by the interleaving back down. */
	 for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) {
	    vert_result_reoder[i] = i - bfc;
	 }
      }
   }

   for (j = 0; j < VERT_RESULT_MAX; j++) {
      i = vert_result_reoder[j];

      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
	 c->nr_outputs++;
	 assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
	 if (i == VERT_RESULT_HPOS) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else if (i == VERT_RESULT_PSIZ) {
	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	    reg++;
	 }
	 else {
	    /* Two restrictions on our compute-to-MRF here.  The
	     * message length for all SEND messages is restricted to
	     * [1,15], so we can't use mrf 15, as that means a length
	     * of 16.
	     *
	     * Additionally, URB writes are aligned to URB rows, so we
	     * need to put an even number of registers of URB data in
	     * each URB write so that the later write is aligned.  A
	     * message length of 15 means 1 message header reg plus 14
	     * regs of URB data.
	     *
	     * For attributes beyond the compute-to-MRF, we compute to
	     * GRFs and they will be written in the second URB_WRITE.
	     */
	    if (first_reladdr_output > i && mrf < 15) {
	       c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
	       mrf++;
	    }
	    else {
	       if (mrf >= 15 && !c->first_overflow_output)
		  c->first_overflow_output = i;
	       c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
	       reg++;
	       mrf++;
	    }
	 }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   /* Staging registers for constants fetched from the pull buffer. */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
	 c->current_const[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
      clear_current_const(c);
   }

   /* Shadow copies for outputs that are also read as sources. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
	 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 7) {
      int header_regs = 2;
      if (c->key.nr_userclip)
	 header_regs += 2;

      /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
       * number of 64-byte (512-bit) units.
       */
      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 3) / 4;
   } else if (intel->gen == 6) {
      int header_regs = 2;
      if (c->key.nr_userclip)
	 header_regs += 2;

      /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
       * number of 128-byte (1024-bit) units.
       */
      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
   } else if (intel->gen == 5)
      /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
       * number of 64-byte (512-bit) units.
       */
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
469
470
471 /**
472 * If an instruction uses a temp reg both as a src and the dest, we
473 * sometimes need to allocate an intermediate temporary.
474 */
475 static void unalias1( struct brw_vs_compile *c,
476 struct brw_reg dst,
477 struct brw_reg arg0,
478 void (*func)( struct brw_vs_compile *,
479 struct brw_reg,
480 struct brw_reg ))
481 {
482 if (dst.file == arg0.file && dst.nr == arg0.nr) {
483 struct brw_compile *p = &c->func;
484 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
485 func(c, tmp, arg0);
486 brw_MOV(p, dst, tmp);
487 release_tmp(c, tmp);
488 }
489 else {
490 func(c, dst, arg0);
491 }
492 }
493
494 /**
495 * \sa unalias2
496 * Checkes if 2-operand instruction needs an intermediate temporary.
497 */
498 static void unalias2( struct brw_vs_compile *c,
499 struct brw_reg dst,
500 struct brw_reg arg0,
501 struct brw_reg arg1,
502 void (*func)( struct brw_vs_compile *,
503 struct brw_reg,
504 struct brw_reg,
505 struct brw_reg ))
506 {
507 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
508 (dst.file == arg1.file && dst.nr == arg1.nr)) {
509 struct brw_compile *p = &c->func;
510 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
511 func(c, tmp, arg0, arg1);
512 brw_MOV(p, dst, tmp);
513 release_tmp(c, tmp);
514 }
515 else {
516 func(c, dst, arg0, arg1);
517 }
518 }
519
520 /**
521 * \sa unalias2
522 * Checkes if 3-operand instruction needs an intermediate temporary.
523 */
524 static void unalias3( struct brw_vs_compile *c,
525 struct brw_reg dst,
526 struct brw_reg arg0,
527 struct brw_reg arg1,
528 struct brw_reg arg2,
529 void (*func)( struct brw_vs_compile *,
530 struct brw_reg,
531 struct brw_reg,
532 struct brw_reg,
533 struct brw_reg ))
534 {
535 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
536 (dst.file == arg1.file && dst.nr == arg1.nr) ||
537 (dst.file == arg2.file && dst.nr == arg2.nr)) {
538 struct brw_compile *p = &c->func;
539 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
540 func(c, tmp, arg0, arg1, arg2);
541 brw_MOV(p, dst, tmp);
542 release_tmp(c, tmp);
543 }
544 else {
545 func(c, dst, arg0, arg1, arg2);
546 }
547 }
548
/* Set-on-condition: dst = (arg0 <cond> arg1) ? 1.0 : 0.0 per channel.
 *
 * dst is first cleared to 0.0; the CMP sets the flag register, and the
 * following MOV of 1.0 only lands in the channels that passed.
 * NOTE(review): the statement order here is load-bearing — it relies on
 * brw_CMP enabling predication for the subsequent MOV and on
 * brw_set_predicate_control_flag_value(p, 0xff) restoring the
 * "always execute" state afterwards; confirm against brw_eu emit state
 * handling.
 */
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
562
/* Mesa IR SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0 per channel. */
static void emit_seq( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}
570
/* Mesa IR SNE: dst = (arg0 != arg1) ? 1.0 : 0.0 per channel. */
static void emit_sne( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}
/* Mesa IR SLT: dst = (arg0 < arg1) ? 1.0 : 0.0 per channel. */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}
585
/* Mesa IR SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0 per channel. */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}
593
/* Mesa IR SGT: dst = (arg0 > arg1) ? 1.0 : 0.0 per channel. */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}
601
/* Mesa IR SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0 per channel. */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
609
/* Mesa IR CMP: dst = (arg0 < 0.0) ? arg1 : arg2, per channel.
 *
 * The CMP sets the per-channel flag; the SEL then runs predicated on it
 * (presumably selecting arg1 on channels that passed — confirm against
 * brw_eu SEL semantics), and predication is explicitly reset afterwards.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
620
/* Sign function: dst = -1.0, 0.0 or +1.0 per channel of arg0.
 *
 * dst starts at 0.0, then the channels where arg0 < 0 are overwritten
 * with -1.0 under predication, and the channels where arg0 > 0 with
 * +1.0.  Predication is reset after each predicated MOV.
 */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
637
/* dst = max(arg0, arg1) per channel.
 *
 * On gen6+, SEL with a GE conditional modifier does the comparison and
 * select in one instruction; earlier gens compare into the flag register
 * first and use a predicated SEL.  Compiler conditional-mod/predicate
 * state is restored afterwards in both paths.
 */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
656
/* dst = min(arg0, arg1) per channel.
 *
 * Mirror of emit_max() with an L (less-than) condition: gen6+ uses SEL
 * with a conditional modifier, earlier gens CMP + predicated SEL.
 */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
675
/* Address register load: dst = round-down(src) (Mesa IR ARL).
 *
 * On gen6+ the RNDD is done into a float-typed view of dst and the MOV
 * performs the float->int conversion — NOTE(review): presumably RNDD
 * straight into an integer destination misbehaves on gen6+; confirm
 * against the gen6 EU docs.  Pre-gen6, RNDD writes dst directly.
 */
static void emit_arl(struct brw_compile *p,
		     struct brw_reg dst,
		     struct brw_reg src)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);

      brw_RNDD(p, dst_f, src);
      brw_MOV(p, dst, dst_f);
   } else {
      brw_RNDD(p, dst, src);
   }
}
691
/* Emit a one-operand math-box operation (EXP, LOG, RSQ, ...) for gen4/5.
 *
 * \param function   BRW_MATH_FUNCTION_* selector
 * \param precision  BRW_MATH_PRECISION_* selector
 */
static void emit_math1_gen4(struct brw_vs_compile *c,
			    GLuint function,
			    struct brw_reg dst,
			    struct brw_reg arg0,
			    GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results. So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   /* The math box can't write to a MRF or with a partial writemask, so
    * stage the result through a full-writemask GRF temporary.
    */
   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2, /* presumably the message register number — confirm */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
730
/* Emit a one-operand math-box operation for gen6+.
 *
 * Both operand and result are staged through temporaries so the math
 * instruction itself can run in align1 mode (see comment below).
 */
static void
emit_math1_gen6(struct brw_vs_compile *c,
		GLuint function,
		struct brw_reg dst,
		struct brw_reg arg0,
		GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src, tmp_dst;

   /* Something is strange on gen6 math in 16-wide mode, though the
    * docs say it's supposed to work.  Punt to using align1 mode,
    * which doesn't do writemasking and swizzles.
    */
   tmp_src = get_tmp(c);
   tmp_dst = get_tmp(c);

   /* Resolve any swizzle on the source before dropping to align1. */
   brw_MOV(p, tmp_src, arg0);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
	    tmp_dst,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2, /* presumably the message register number — confirm */
	    tmp_src,
	    BRW_MATH_DATA_SCALAR,
	    precision);
   brw_set_access_mode(p, BRW_ALIGN_16);

   /* Apply dst's writemask via a normal align16 MOV. */
   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src);
   release_tmp(c, tmp_dst);
}
766
767 static void
768 emit_math1(struct brw_vs_compile *c,
769 GLuint function,
770 struct brw_reg dst,
771 struct brw_reg arg0,
772 GLuint precision)
773 {
774 struct brw_compile *p = &c->func;
775 struct intel_context *intel = &p->brw->intel;
776
777 if (intel->gen >= 6)
778 emit_math1_gen6(c, function, dst, arg0, precision);
779 else
780 emit_math1_gen4(c, function, dst, arg0, precision);
781 }
782
/* Emit a two-operand math-box operation (e.g. POW) for gen4/5.
 *
 * The second operand is passed through message register 3; the first
 * rides along with the math SEND itself.
 */
static void emit_math2_gen4( struct brw_vs_compile *c,
			     GLuint function,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   /* Stage through a GRF temporary if dst is not a full-writemask GRF
    * (same math-box restriction as emit_math1_gen4).
    */
   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second operand goes in m3 for the math message. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2, /* presumably the message register number — confirm */
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
817
/* Emit a two-operand math-box operation for gen6+.
 *
 * As in emit_math1_gen6(), both sources and the destination are staged
 * through temporaries so the math instruction can run in align1 mode
 * (no writemasking/swizzles there).
 */
static void emit_math2_gen6( struct brw_vs_compile *c,
			     GLuint function,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src0, tmp_src1, tmp_dst;

   tmp_src0 = get_tmp(c);
   tmp_src1 = get_tmp(c);
   tmp_dst = get_tmp(c);

   /* Resolve source swizzles before dropping to align1. */
   brw_MOV(p, tmp_src0, arg0);
   brw_MOV(p, tmp_src1, arg1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
	     tmp_dst,
	     function,
	     tmp_src0,
	     tmp_src1);
   brw_set_access_mode(p, BRW_ALIGN_16);

   /* Apply dst's writemask via a normal align16 MOV. */
   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src0);
   release_tmp(c, tmp_src1);
   release_tmp(c, tmp_dst);
}
849
850 static void emit_math2( struct brw_vs_compile *c,
851 GLuint function,
852 struct brw_reg dst,
853 struct brw_reg arg0,
854 struct brw_reg arg1,
855 GLuint precision)
856 {
857 struct brw_compile *p = &c->func;
858 struct intel_context *intel = &p->brw->intel;
859
860 if (intel->gen >= 6)
861 emit_math2_gen6(c, function, dst, arg0, arg1, precision);
862 else
863 emit_math2_gen4(c, function, dst, arg0, arg1, precision);
864 }
865
/* Mesa IR EXP instruction (dst must not alias arg0):
 *   result.x = 2^floor(arg0.x)  (built by bit manipulation)
 *   result.y = arg0.x - floor(arg0.x)
 *   result.z = 2^arg0.x         (via the math box)
 *   result.w = 1.0
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
921
922
/* Mesa IR LOG instruction (dst must not alias arg0):
 *   result.x = exponent of arg0.x (floor(log2 |x|))
 *   result.y = mantissa of arg0.x, in [1.0, 2.0)
 *   result.z = log2(|arg0.x|) = result.x + log2(result.y)
 *   result.w = 1.0
 * The x and y components are extracted by IEEE-754 bit manipulation.
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   /* Stage through a temporary if dst can't be written directly with a
    * full writemask, or is not a GRF.
    */
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* Strip the sign bit, shift the biased exponent down, unbias. */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Keep the mantissa bits and OR in a zero exponent (bias 127),
       * yielding a float in [1.0, 2.0).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1012
1013
/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx  r1
 *
 * Mesa IR DST (distance vector):
 *   dst.x = 1.0, dst.y = arg0.y * arg1.y, dst.z = arg0.z, dst.w = arg1.w
 * Each component is emitted separately, gated on dst's writemask.
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
1034
1035
/* Cross product: dst = t x u, computed as
 *   dst = t.yzx * u.zxy - t.zxy * u.yzx
 * The first MUL writes only the accumulator (null dest); the MAC then
 * multiplies into it and writes dst — statement order is load-bearing.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
1044
1045
/* Mesa IR LIT (lighting coefficients; dst must not alias arg0):
 *   dst.x = 1.0, dst.w = 1.0, dst.y = dst.z = 0.0 by default;
 *   if arg0.x > 0: dst.y = arg0.x, and if arg0.y > 0 as well,
 *   dst.z = arg0.y ^ arg0.w.
 *
 * NOTE(review): tmp is only actually used for the Z staging inside the
 * IF; the unconditional MOVs and the Y write go straight to dst even
 * when need_tmp is set — confirm that is intended for MRF dsts.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Defaults: y = z = 0, x = w = 1. */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisions.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Clamp the exponent base at zero before raising to the power. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }
   brw_ENDIF(p);

   release_tmp(c, tmp);
}
1085
/* Mesa IR LRP (dst must not alias any source):
 *   dst = arg0 * arg1 + (1 - arg0) * arg2
 *
 * The (1 - arg0) factor lands in dst; the MUL writes only the
 * accumulator (null dest) and the MAC folds in arg0 * arg1 — the
 * accumulator carry between MUL and MAC makes statement order
 * load-bearing.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
1098
/** 3 or 4-component vector normalization: dst = arg0 / |arg0|.
 *
 * \param num_comps  3 for NRM3 (DP3), anything else uses DP4.
 */
static void emit_nrm( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      int num_comps)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);

   /* tmp = dot(arg0, arg0) */
   if (num_comps == 3)
      brw_DP3(p, tmp, arg0, arg0);
   else
      brw_DP4(p, tmp, arg0, arg0);

   /* tmp = 1 / sqrt(tmp) */
   emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);

   /* dst = arg0 * tmp */
   brw_MUL(p, dst, arg0, tmp);

   release_tmp(c, tmp);
}
1122
1123
/**
 * Fetch the pull-constant value for source argument \p argIndex into
 * that argument's dedicated scratch register and return it.
 *
 * The last constant index loaded per arg slot is cached in
 * c->current_const[], so a repeated read of the same constant skips
 * the data-port read entirely.
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
	     src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                     /* writeback dest */
                       16 * src->Index,               /* byte offset: one vec4 per constant */
                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 1);
   const_reg.subnr = 0;

   return const_reg;
}
1157
/**
 * Fetch a relatively-addressed pull constant ([a0.x + src->Index]) for
 * source argument \p argIndex into its scratch register and return it.
 *
 * Unlike get_constant(), the result is never cached/reused: the address
 * register contents may change between instructions.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   uint32_t offset;

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

#if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   if (intel->gen >= 6) {
      /* gen6 message takes the element index directly. */
      offset = src->Index;
   } else {
      /* Pre-gen6 wants byte offsets: scale the address register by the
       * vec4 size (16 bytes) and use a byte offset for the static part.
       * NOTE(review): byte_addr_reg comes from get_tmp() and is never
       * released here -- presumably harmless given the temp allocation
       * scheme, but verify against get_tmp/release_tmp discipline.
       */
      struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
      brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
      addr_reg = byte_addr_reg;
      offset = 16 * src->Index;
   }

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,
			     addr_reg,
			     offset,
			     SURF_INDEX_VERT_CONST_BUFFER);

   return const_reg;
}
1199
1200
1201
1202 /* TODO: relative addressing!
1203 */
1204 static struct brw_reg get_reg( struct brw_vs_compile *c,
1205 gl_register_file file,
1206 GLuint index )
1207 {
1208 switch (file) {
1209 case PROGRAM_TEMPORARY:
1210 case PROGRAM_INPUT:
1211 case PROGRAM_OUTPUT:
1212 assert(c->regs[file][index].nr != 0);
1213 return c->regs[file][index];
1214 case PROGRAM_STATE_VAR:
1215 case PROGRAM_CONSTANT:
1216 case PROGRAM_UNIFORM:
1217 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1218 return c->regs[PROGRAM_STATE_VAR][index];
1219 case PROGRAM_ADDRESS:
1220 assert(index == 0);
1221 return c->regs[file][index];
1222
1223 case PROGRAM_UNDEFINED: /* undef values */
1224 return brw_null_reg();
1225
1226 case PROGRAM_LOCAL_PARAM:
1227 case PROGRAM_ENV_PARAM:
1228 case PROGRAM_WRITE_ONLY:
1229 default:
1230 assert(0);
1231 return brw_null_reg();
1232 }
1233 }
1234
1235
1236 /**
1237 * Indirect addressing: get reg[[arg] + offset].
1238 */
1239 static struct brw_reg deref( struct brw_vs_compile *c,
1240 struct brw_reg arg,
1241 GLint offset,
1242 GLuint reg_size )
1243 {
1244 struct brw_compile *p = &c->func;
1245 struct brw_reg tmp = get_tmp(c);
1246 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1247 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1248 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
1249 struct brw_reg indirect = brw_vec4_indirect(0,0);
1250 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1251
1252 /* Set the vertical stride on the register access so that the first
1253 * 4 components come from a0.0 and the second 4 from a0.1.
1254 */
1255 indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1256
1257 {
1258 brw_push_insn_state(p);
1259 brw_set_access_mode(p, BRW_ALIGN_1);
1260
1261 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1262 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1263
1264 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1265 brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1266
1267 brw_MOV(p, tmp, indirect);
1268
1269 brw_pop_insn_state(p);
1270 }
1271
1272 /* NOTE: tmp not released */
1273 return tmp;
1274 }
1275
/**
 * Store \p val to inst's destination register when the destination uses
 * relative (address-register) addressing.
 *
 * Destination indirect addressing is 1x1 only (a single a0 index), so
 * the two interleaved vertices' vec4s are written with two separate
 * MOVs, reprogramming a0.0 in between.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   /* Static byte address of the base register. */
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* First vertex: a0.0 = addr * reg_size + byte_offset */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Second vertex: second address value, shifted half a register
    * (16 bytes) into the interleaved layout.
    */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1310
1311 /**
1312 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1313 * TODO: relative addressing!
1314 */
1315 static struct brw_reg
1316 get_src_reg( struct brw_vs_compile *c,
1317 const struct prog_instruction *inst,
1318 GLuint argIndex )
1319 {
1320 const GLuint file = inst->SrcReg[argIndex].File;
1321 const GLint index = inst->SrcReg[argIndex].Index;
1322 const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
1323
1324 if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1325 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1326
1327 if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1328 SWIZZLE_ZERO,
1329 SWIZZLE_ZERO,
1330 SWIZZLE_ZERO)) {
1331 return brw_imm_f(0.0f);
1332 } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1333 SWIZZLE_ONE,
1334 SWIZZLE_ONE,
1335 SWIZZLE_ONE)) {
1336 if (src->Negate)
1337 return brw_imm_f(-1.0F);
1338 else
1339 return brw_imm_f(1.0F);
1340 } else if (src->File == PROGRAM_CONSTANT) {
1341 const struct gl_program_parameter_list *params;
1342 float f;
1343 int component = -1;
1344
1345 switch (src->Swizzle) {
1346 case SWIZZLE_XXXX:
1347 component = 0;
1348 break;
1349 case SWIZZLE_YYYY:
1350 component = 1;
1351 break;
1352 case SWIZZLE_ZZZZ:
1353 component = 2;
1354 break;
1355 case SWIZZLE_WWWW:
1356 component = 3;
1357 break;
1358 }
1359
1360 if (component >= 0) {
1361 params = c->vp->program.Base.Parameters;
1362 f = params->ParameterValues[src->Index][component].f;
1363
1364 if (src->Abs)
1365 f = fabs(f);
1366 if (src->Negate)
1367 f = -f;
1368 return brw_imm_f(f);
1369 }
1370 }
1371 }
1372
1373 switch (file) {
1374 case PROGRAM_TEMPORARY:
1375 case PROGRAM_INPUT:
1376 case PROGRAM_OUTPUT:
1377 if (relAddr) {
1378 return deref(c, c->regs[file][0], index, 32);
1379 }
1380 else {
1381 assert(c->regs[file][index].nr != 0);
1382 return c->regs[file][index];
1383 }
1384
1385 case PROGRAM_STATE_VAR:
1386 case PROGRAM_CONSTANT:
1387 case PROGRAM_UNIFORM:
1388 case PROGRAM_ENV_PARAM:
1389 case PROGRAM_LOCAL_PARAM:
1390 if (!relAddr && c->constant_map[index] != -1) {
1391 /* Take from the push constant buffer if possible. */
1392 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1393 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1394 } else {
1395 /* Must be in the pull constant buffer then .*/
1396 assert(c->vp->use_const_buffer);
1397 if (relAddr)
1398 return get_reladdr_constant(c, inst, argIndex);
1399 else
1400 return get_constant(c, inst, argIndex);
1401 }
1402 case PROGRAM_ADDRESS:
1403 assert(index == 0);
1404 return c->regs[file][index];
1405
1406 case PROGRAM_UNDEFINED:
1407 /* this is a normal case since we loop over all three src args */
1408 return brw_null_reg();
1409
1410 case PROGRAM_WRITE_ONLY:
1411 default:
1412 assert(0);
1413 return brw_null_reg();
1414 }
1415 }
1416
1417 /**
1418 * Return the brw reg for the given instruction's src argument.
1419 * Will return mangled results for SWZ op. The emit_swz() function
1420 * ignores this result and recalculates taking extended swizzles into
1421 * account.
1422 */
1423 static struct brw_reg get_arg( struct brw_vs_compile *c,
1424 const struct prog_instruction *inst,
1425 GLuint argIndex )
1426 {
1427 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1428 struct brw_reg reg;
1429
1430 if (src->File == PROGRAM_UNDEFINED)
1431 return brw_null_reg();
1432
1433 reg = get_src_reg(c, inst, argIndex);
1434
1435 /* Convert 3-bit swizzle to 2-bit.
1436 */
1437 if (reg.file != BRW_IMMEDIATE_VALUE) {
1438 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1439 GET_SWZ(src->Swizzle, 1),
1440 GET_SWZ(src->Swizzle, 2),
1441 GET_SWZ(src->Swizzle, 3));
1442
1443 /* Note this is ok for non-swizzle ARB_vp instructions */
1444 reg.negate = src->Negate ? 1 : 0;
1445 }
1446
1447 return reg;
1448 }
1449
1450
1451 /**
1452 * Get brw register for the given program dest register.
1453 */
1454 static struct brw_reg get_dst( struct brw_vs_compile *c,
1455 struct prog_dst_register dst )
1456 {
1457 struct brw_reg reg;
1458
1459 switch (dst.File) {
1460 case PROGRAM_TEMPORARY:
1461 case PROGRAM_OUTPUT:
1462 /* register-indirect addressing is only 1x1, not VxH, for
1463 * destination regs. So, for RelAddr we'll return a temporary
1464 * for the dest and do a move of the result to the RelAddr
1465 * register after the instruction emit.
1466 */
1467 if (dst.RelAddr) {
1468 reg = get_tmp(c);
1469 } else {
1470 assert(c->regs[dst.File][dst.Index].nr != 0);
1471 reg = c->regs[dst.File][dst.Index];
1472 }
1473 break;
1474 case PROGRAM_ADDRESS:
1475 assert(dst.Index == 0);
1476 reg = c->regs[dst.File][dst.Index];
1477 break;
1478 case PROGRAM_UNDEFINED:
1479 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1480 reg = brw_null_reg();
1481 break;
1482 default:
1483 assert(0);
1484 reg = brw_null_reg();
1485 }
1486
1487 assert(reg.type != BRW_IMMEDIATE_VALUE);
1488 reg.dw1.bits.writemask = dst.WriteMask;
1489
1490 return reg;
1491 }
1492
1493
/**
 * Emit the SWZ (extended swizzle) instruction.
 *
 * Extended swizzle components may be X/Y/Z/W as well as literal 0 and 1,
 * each independently negatable, so the result is assembled from up to
 * three MOVs (register channels, zeros, ones) plus a negation pass.
 */
static void emit_swz( struct brw_vs_compile *c,
		      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   /* The negate MOV below reads tmp as a source, which requires a GRF. */
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written channel: taken from the source register,
    * constant zero, or constant one.
    */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
	 GLubyte s = GET_SWZ(src.Swizzle, i);
	 switch (s) {
	 case SWIZZLE_X:
	 case SWIZZLE_Y:
	 case SWIZZLE_Z:
	 case SWIZZLE_W:
	    src_mask |= 1<<i;
	    src_swz[i] = s;
	    break;
	 case SWIZZLE_ZERO:
	    zeros_mask |= 1<<i;
	    break;
	 case SWIZZLE_ONE:
	    ones_mask |= 1<<i;
	    break;
	 }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-channel bitmask used directly as a writemask
    * here, so only the negated channels are flipped.
    */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1562
1563 static int
1564 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1565 {
1566 struct intel_context *intel = &brw->intel;
1567
1568 if (intel->gen >= 6) {
1569 /* URB data written (does not include the message header reg) must
1570 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1571 * section 5.4.3.2.2: URB_INTERLEAVED.
1572 *
1573 * URB entries are allocated on a multiple of 1024 bits, so an
1574 * extra 128 bits written here to make the end align to 256 is
1575 * no problem.
1576 */
1577 if ((mlen % 2) != 1)
1578 mlen++;
1579 }
1580
1581 return mlen;
1582 }
1583
1584 /**
1585 * Post-vertex-program processing. Send the results to the URB.
1586 */
1587 static void emit_vertex_write( struct brw_vs_compile *c)
1588 {
1589 struct brw_compile *p = &c->func;
1590 struct brw_context *brw = p->brw;
1591 struct intel_context *intel = &brw->intel;
1592 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1593 struct brw_reg ndc;
1594 int eot;
1595 GLuint len_vertex_header = 2;
1596 int next_mrf, i;
1597 int msg_len;
1598
1599 if (c->key.copy_edgeflag) {
1600 brw_MOV(p,
1601 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1602 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1603 }
1604
1605 if (intel->gen < 6) {
1606 /* Build ndc coords */
1607 ndc = get_tmp(c);
1608 /* ndc = 1.0 / pos.w */
1609 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1610 /* ndc.xyz = pos * ndc */
1611 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1612 }
1613
1614 /* Update the header for point size, user clipping flags, and -ve rhw
1615 * workaround.
1616 */
1617 if (intel->gen >= 6) {
1618 struct brw_reg m1 = brw_message_reg(1);
1619
1620 /* On gen6, m1 has each value in a separate dword, so we never
1621 * need to mess with a temporary for computing the m1 value.
1622 */
1623 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1624 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1625 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
1626 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
1627 }
1628
1629 /* Set the user clip distances in dword 8-15. (m3-4)*/
1630 if (c->key.nr_userclip) {
1631 for (i = 0; i < c->key.nr_userclip; i++) {
1632 struct brw_reg m;
1633 if (i < 4)
1634 m = brw_message_reg(3);
1635 else
1636 m = brw_message_reg(4);
1637
1638 brw_DP4(p, brw_writemask(m, (1 << (i & 3))),pos, c->userplane[i]);
1639 }
1640 }
1641 } else if ((c->prog_data.outputs_written &
1642 BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1643 c->key.nr_userclip || brw->has_negative_rhw_bug) {
1644 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1645 GLuint i;
1646
1647 brw_MOV(p, header1, brw_imm_ud(0));
1648
1649 brw_set_access_mode(p, BRW_ALIGN_16);
1650
1651 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1652 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1653 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
1654 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1655 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
1656 header1, brw_imm_ud(0x7ff<<8));
1657 }
1658
1659 for (i = 0; i < c->key.nr_userclip; i++) {
1660 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1661 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1662 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1663 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1664 }
1665
1666 /* i965 clipping workaround:
1667 * 1) Test for -ve rhw
1668 * 2) If set,
1669 * set ndc = (0,0,0,0)
1670 * set ucp[6] = 1
1671 *
1672 * Later, clipping will detect ucp[6] and ensure the primitive is
1673 * clipped against all fixed planes.
1674 */
1675 if (brw->has_negative_rhw_bug) {
1676 brw_CMP(p,
1677 vec8(brw_null_reg()),
1678 BRW_CONDITIONAL_L,
1679 brw_swizzle1(ndc, 3),
1680 brw_imm_f(0));
1681
1682 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1683 brw_MOV(p, ndc, brw_imm_f(0));
1684 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1685 }
1686
1687 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1688 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1689 brw_set_access_mode(p, BRW_ALIGN_16);
1690
1691 release_tmp(c, header1);
1692 }
1693 else {
1694 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1695 }
1696
1697 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1698 * of zeros followed by two sets of NDC coordinates:
1699 */
1700 brw_set_access_mode(p, BRW_ALIGN_1);
1701 brw_set_acc_write_control(p, 0);
1702
1703 /* The VUE layout is documented in Volume 2a. */
1704 if (intel->gen >= 6) {
1705 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1706 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1707 * dword 4-7 (m2) is the 4D space position
1708 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1709 * enabled.
1710 * m3 or 5 is the first vertex element data we fill, which is
1711 * the vertex position.
1712 */
1713 brw_MOV(p, brw_message_reg(2), pos);
1714 len_vertex_header = 1;
1715 if (c->key.nr_userclip > 0)
1716 len_vertex_header += 2;
1717 } else if (intel->gen == 5) {
1718 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1719 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1720 * dword 4-7 (m2) is the ndc position (set above)
1721 * dword 8-11 (m3) of the vertex header is the 4D space position
1722 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1723 * m6 is a pad so that the vertex element data is aligned
1724 * m7 is the first vertex data we fill, which is the vertex position.
1725 */
1726 brw_MOV(p, brw_message_reg(2), ndc);
1727 brw_MOV(p, brw_message_reg(3), pos);
1728 brw_MOV(p, brw_message_reg(7), pos);
1729 len_vertex_header = 6;
1730 } else {
1731 /* There are 8 dwords in VUE header pre-Ironlake:
1732 * dword 0-3 (m1) is indices, point width, clip flags.
1733 * dword 4-7 (m2) is ndc position (set above)
1734 *
1735 * dword 8-11 (m3) is the first vertex data, which we always have be the
1736 * vertex position.
1737 */
1738 brw_MOV(p, brw_message_reg(2), ndc);
1739 brw_MOV(p, brw_message_reg(3), pos);
1740 len_vertex_header = 2;
1741 }
1742
1743 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1744 next_mrf = 2 + len_vertex_header;
1745 for (i = 0; i < VERT_RESULT_MAX; i++) {
1746 if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
1747 break;
1748 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
1749 continue;
1750 if (i == VERT_RESULT_PSIZ)
1751 continue;
1752
1753 if (i >= VERT_RESULT_TEX0 &&
1754 c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
1755 brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
1756 next_mrf++;
1757 } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
1758 next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
1759 }
1760 }
1761
1762 eot = (c->first_overflow_output == 0);
1763
1764 /* Message header, plus VUE header, plus the (first set of) outputs. */
1765 msg_len = 1 + len_vertex_header + c->nr_outputs;
1766 msg_len = align_interleaved_urb_mlen(brw, msg_len);
1767 /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
1768 msg_len = MIN2(msg_len, (BRW_MAX_MRF - 1)),
1769
1770 brw_urb_WRITE(p,
1771 brw_null_reg(), /* dest */
1772 0, /* starting mrf reg nr */
1773 c->r0, /* src */
1774 0, /* allocate */
1775 1, /* used */
1776 msg_len,
1777 0, /* response len */
1778 eot, /* eot */
1779 eot, /* writes complete */
1780 0, /* urb destination offset */
1781 BRW_URB_SWIZZLE_INTERLEAVE);
1782
1783 if (c->first_overflow_output > 0) {
1784 /* Not all of the vertex outputs/results fit into the MRF.
1785 * Move the overflowed attributes from the GRF to the MRF and
1786 * issue another brw_urb_WRITE().
1787 */
1788 GLuint i, mrf = 1;
1789 for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
1790 if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
1791 /* move from GRF to MRF */
1792 brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
1793 mrf++;
1794 }
1795 }
1796
1797 brw_urb_WRITE(p,
1798 brw_null_reg(), /* dest */
1799 0, /* starting mrf reg nr */
1800 c->r0, /* src */
1801 0, /* allocate */
1802 1, /* used */
1803 align_interleaved_urb_mlen(brw, mrf),
1804 0, /* response len */
1805 1, /* eot */
1806 1, /* writes complete */
1807 14 / 2, /* urb destination offset */
1808 BRW_URB_SWIZZLE_INTERLEAVE);
1809 }
1810 }
1811
/**
 * Return GL_TRUE if the previously-emitted instruction provably left
 * \p val in the hardware accumulator, letting OPCODE_MAD skip the
 * explicit "MOV acc, val".
 *
 * Only a MOV/MAC/MUL whose align16 destination exactly matches val
 * (same file/type/reg/subreg, full writemask, direct addressing, no
 * source modifiers on val) qualifies.  NOTE(review): this relies on
 * those opcodes updating the accumulator as a side effect -- presumably
 * via the AccWrCtrl enabled in brw_vs_emit(); confirm.
 */
static GLboolean
accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];

   if (p->nr_insn == 0)
      return GL_FALSE;

   if (val.address_mode != BRW_ADDRESS_DIRECT)
      return GL_FALSE;

   if (val.negate || val.abs)
      return GL_FALSE;

   switch (prev_insn->header.opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MUL:
      if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
	  prev_insn->header.execution_size == val.width &&
	  prev_insn->bits1.da1.dest_reg_file == val.file &&
	  prev_insn->bits1.da1.dest_reg_type == val.type &&
	  prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
	  prev_insn->bits1.da1.dest_reg_nr == val.nr &&
	  prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
	  prev_insn->bits1.da16.dest_writemask == 0xf)
	 return GL_TRUE;
      else
	 return GL_FALSE;
   default:
      return GL_FALSE;
   }
}
1846
1847 static uint32_t
1848 get_predicate(const struct prog_instruction *inst)
1849 {
1850 if (inst->DstReg.CondMask == COND_TR)
1851 return BRW_PREDICATE_NONE;
1852
1853 /* All of GLSL only produces predicates for COND_NE and one channel per
1854 * vector. Fail badly if someone starts doing something else, as it might
1855 * mean infinite looping or something.
1856 *
1857 * We'd like to support all the condition codes, but our hardware doesn't
1858 * quite match the Mesa IR, which is modeled after the NV extensions. For
1859 * those, the instruction may update the condition codes or not, then any
1860 * later instruction may use one of those condition codes. For gen4, the
1861 * instruction may update the flags register based on one of the condition
1862 * codes output by the instruction, and then further instructions may
1863 * predicate on that. We can probably support this, but it won't
1864 * necessarily be easy.
1865 */
1866 assert(inst->DstReg.CondMask == COND_NE);
1867
1868 switch (inst->DstReg.CondSwizzle) {
1869 case SWIZZLE_XXXX:
1870 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1871 case SWIZZLE_YYYY:
1872 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1873 case SWIZZLE_ZZZZ:
1874 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1875 case SWIZZLE_WWWW:
1876 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1877 default:
1878 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1879 inst->DstReg.CondMask);
1880 return BRW_PREDICATE_NORMAL;
1881 }
1882 }
1883
1884 static void
1885 brw_vs_rescale_gl_fixed(struct brw_vs_compile *c)
1886 {
1887 struct brw_compile *p = &c->func;
1888 int i;
1889
1890 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
1891 if (!(c->prog_data.inputs_read & (1 << i)))
1892 continue;
1893
1894 if (c->key.gl_fixed_input_size[i] != 0) {
1895 struct brw_reg reg = c->regs[PROGRAM_INPUT][i];
1896
1897 brw_MUL(p,
1898 brw_writemask(reg, (1 << c->key.gl_fixed_input_size[i]) - 1),
1899 reg, brw_imm_f(1.0 / 65536.0));
1900 }
1901 }
1902 }
1903
1904 /* Emit the vertex program instructions here.
1905 */
1906 void brw_vs_emit(struct brw_vs_compile *c )
1907 {
1908 #define MAX_IF_DEPTH 32
1909 #define MAX_LOOP_DEPTH 32
1910 struct brw_compile *p = &c->func;
1911 struct brw_context *brw = p->brw;
1912 struct intel_context *intel = &brw->intel;
1913 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1914 GLuint insn, loop_depth = 0;
1915 struct brw_instruction *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1916 int if_depth_in_loop[MAX_LOOP_DEPTH];
1917 const struct brw_indirect stack_index = brw_indirect(0, 0);
1918 GLuint index;
1919 GLuint file;
1920
1921 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
1922 printf("vs-mesa:\n");
1923 _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1924 GL_TRUE);
1925 printf("\n");
1926 }
1927
1928 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1929 brw_set_access_mode(p, BRW_ALIGN_16);
1930 if_depth_in_loop[loop_depth] = 0;
1931
1932 brw_set_acc_write_control(p, 1);
1933
1934 for (insn = 0; insn < nr_insns; insn++) {
1935 GLuint i;
1936 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1937
1938 /* Message registers can't be read, so copy the output into GRF
1939 * register if they are used in source registers
1940 */
1941 for (i = 0; i < 3; i++) {
1942 struct prog_src_register *src = &inst->SrcReg[i];
1943 GLuint index = src->Index;
1944 GLuint file = src->File;
1945 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1946 c->output_regs[index].used_in_src = GL_TRUE;
1947 }
1948
1949 switch (inst->Opcode) {
1950 case OPCODE_CAL:
1951 case OPCODE_RET:
1952 c->needs_stack = GL_TRUE;
1953 break;
1954 default:
1955 break;
1956 }
1957 }
1958
1959 /* Static register allocation
1960 */
1961 brw_vs_alloc_regs(c);
1962
1963 brw_vs_rescale_gl_fixed(c);
1964
1965 if (c->needs_stack)
1966 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1967
1968 for (insn = 0; insn < nr_insns; insn++) {
1969
1970 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1971 struct brw_reg args[3], dst;
1972 GLuint i;
1973
1974 #if 0
1975 printf("%d: ", insn);
1976 _mesa_print_instruction(inst);
1977 #endif
1978
1979 /* Get argument regs. SWZ is special and does this itself.
1980 */
1981 if (inst->Opcode != OPCODE_SWZ)
1982 for (i = 0; i < 3; i++) {
1983 const struct prog_src_register *src = &inst->SrcReg[i];
1984 index = src->Index;
1985 file = src->File;
1986 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) {
1987 /* Can't just make get_arg "do the right thing" here because
1988 * other callers of get_arg and get_src_reg don't expect any
1989 * special behavior for the c->output_regs[index].used_in_src
1990 * case.
1991 */
1992 args[i] = c->output_regs[index].reg;
1993 args[i].dw1.bits.swizzle =
1994 BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1995 GET_SWZ(src->Swizzle, 1),
1996 GET_SWZ(src->Swizzle, 2),
1997 GET_SWZ(src->Swizzle, 3));
1998
1999 /* Note this is ok for non-swizzle ARB_vp instructions */
2000 args[i].negate = src->Negate ? 1 : 0;
2001 } else
2002 args[i] = get_arg(c, inst, i);
2003 }
2004
2005 /* Get dest regs. Note that it is possible for a reg to be both
2006 * dst and arg, given the static allocation of registers. So
2007 * care needs to be taken emitting multi-operation instructions.
2008 */
2009 index = inst->DstReg.Index;
2010 file = inst->DstReg.File;
2011 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
2012 /* Can't just make get_dst "do the right thing" here because other
2013 * callers of get_dst don't expect any special behavior for the
2014 * c->output_regs[index].used_in_src case.
2015 */
2016 dst = brw_writemask(c->output_regs[index].reg, inst->DstReg.WriteMask);
2017 else
2018 dst = get_dst(c, inst->DstReg);
2019
2020 if (inst->SaturateMode != SATURATE_OFF) {
2021 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
2022 inst->SaturateMode);
2023 }
2024
2025 switch (inst->Opcode) {
2026 case OPCODE_ABS:
2027 args[0].negate = false;
2028 brw_MOV(p, dst, brw_abs(args[0]));
2029 break;
2030 case OPCODE_ADD:
2031 brw_ADD(p, dst, args[0], args[1]);
2032 break;
2033 case OPCODE_COS:
2034 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
2035 break;
2036 case OPCODE_DP2:
2037 brw_DP2(p, dst, args[0], args[1]);
2038 break;
2039 case OPCODE_DP3:
2040 brw_DP3(p, dst, args[0], args[1]);
2041 break;
2042 case OPCODE_DP4:
2043 brw_DP4(p, dst, args[0], args[1]);
2044 break;
2045 case OPCODE_DPH:
2046 brw_DPH(p, dst, args[0], args[1]);
2047 break;
2048 case OPCODE_NRM3:
2049 emit_nrm(c, dst, args[0], 3);
2050 break;
2051 case OPCODE_NRM4:
2052 emit_nrm(c, dst, args[0], 4);
2053 break;
2054 case OPCODE_DST:
2055 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
2056 break;
2057 case OPCODE_EXP:
2058 unalias1(c, dst, args[0], emit_exp_noalias);
2059 break;
2060 case OPCODE_EX2:
2061 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
2062 break;
2063 case OPCODE_ARL:
2064 emit_arl(p, dst, args[0]);
2065 break;
2066 case OPCODE_FLR:
2067 brw_RNDD(p, dst, args[0]);
2068 break;
2069 case OPCODE_FRC:
2070 brw_FRC(p, dst, args[0]);
2071 break;
2072 case OPCODE_LOG:
2073 unalias1(c, dst, args[0], emit_log_noalias);
2074 break;
2075 case OPCODE_LG2:
2076 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
2077 break;
2078 case OPCODE_LIT:
2079 unalias1(c, dst, args[0], emit_lit_noalias);
2080 break;
2081 case OPCODE_LRP:
2082 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
2083 break;
2084 case OPCODE_MAD:
2085 if (!accumulator_contains(c, args[2]))
2086 brw_MOV(p, brw_acc_reg(), args[2]);
2087 brw_MAC(p, dst, args[0], args[1]);
2088 break;
2089 case OPCODE_CMP:
2090 emit_cmp(p, dst, args[0], args[1], args[2]);
2091 break;
2092 case OPCODE_MAX:
2093 emit_max(p, dst, args[0], args[1]);
2094 break;
2095 case OPCODE_MIN:
2096 emit_min(p, dst, args[0], args[1]);
2097 break;
2098 case OPCODE_MOV:
2099 brw_MOV(p, dst, args[0]);
2100 break;
2101 case OPCODE_MUL:
2102 brw_MUL(p, dst, args[0], args[1]);
2103 break;
2104 case OPCODE_POW:
2105 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
2106 break;
2107 case OPCODE_RCP:
2108 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
2109 break;
2110 case OPCODE_RSQ:
2111 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
2112 break;
2113
2114 case OPCODE_SEQ:
2115 unalias2(c, dst, args[0], args[1], emit_seq);
2116 break;
2117 case OPCODE_SIN:
2118 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
2119 break;
2120 case OPCODE_SNE:
2121 unalias2(c, dst, args[0], args[1], emit_sne);
2122 break;
2123 case OPCODE_SGE:
2124 unalias2(c, dst, args[0], args[1], emit_sge);
2125 break;
2126 case OPCODE_SGT:
2127 unalias2(c, dst, args[0], args[1], emit_sgt);
2128 break;
2129 case OPCODE_SLT:
2130 unalias2(c, dst, args[0], args[1], emit_slt);
2131 break;
2132 case OPCODE_SLE:
2133 unalias2(c, dst, args[0], args[1], emit_sle);
2134 break;
2135 case OPCODE_SSG:
2136 unalias1(c, dst, args[0], emit_sign);
2137 break;
2138 case OPCODE_SUB:
2139 brw_ADD(p, dst, args[0], negate(args[1]));
2140 break;
2141 case OPCODE_SWZ:
2142 /* The args[0] value can't be used here as it won't have
2143 * correctly encoded the full swizzle:
2144 */
2145 emit_swz(c, dst, inst);
2146 break;
2147 case OPCODE_TRUNC:
2148 /* round toward zero */
2149 brw_RNDZ(p, dst, args[0]);
2150 break;
2151 case OPCODE_XPD:
2152 emit_xpd(p, dst, args[0], args[1]);
2153 break;
2154 case OPCODE_IF: {
2155 struct brw_instruction *if_inst = brw_IF(p, BRW_EXECUTE_8);
2156 /* Note that brw_IF smashes the predicate_control field. */
2157 if_inst->header.predicate_control = get_predicate(inst);
2158 if_depth_in_loop[loop_depth]++;
2159 break;
2160 }
2161 case OPCODE_ELSE:
2162 clear_current_const(c);
2163 brw_ELSE(p);
2164 break;
2165 case OPCODE_ENDIF:
2166 clear_current_const(c);
2167 brw_ENDIF(p);
2168 if_depth_in_loop[loop_depth]--;
2169 break;
2170 case OPCODE_BGNLOOP:
2171 clear_current_const(c);
2172 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2173 if_depth_in_loop[loop_depth] = 0;
2174 break;
2175 case OPCODE_BRK:
2176 brw_set_predicate_control(p, get_predicate(inst));
2177 brw_BREAK(p, if_depth_in_loop[loop_depth]);
2178 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2179 break;
2180 case OPCODE_CONT:
2181 brw_set_predicate_control(p, get_predicate(inst));
2182 if (intel->gen >= 6) {
2183 gen6_CONT(p, loop_inst[loop_depth - 1]);
2184 } else {
2185 brw_CONT(p, if_depth_in_loop[loop_depth]);
2186 }
2187 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2188 break;
2189
2190 case OPCODE_ENDLOOP: {
2191 clear_current_const(c);
2192 struct brw_instruction *inst0, *inst1;
2193 GLuint br = 1;
2194
2195 loop_depth--;
2196
2197 if (intel->gen == 5)
2198 br = 2;
2199
2200 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2201
2202 if (intel->gen < 6) {
2203 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2204 while (inst0 > loop_inst[loop_depth]) {
2205 inst0--;
2206 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2207 inst0->bits3.if_else.jump_count == 0) {
2208 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2209 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2210 inst0->bits3.if_else.jump_count == 0) {
2211 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2212 }
2213 }
2214 }
2215 }
2216 break;
2217
2218 case OPCODE_BRA:
2219 brw_set_predicate_control(p, get_predicate(inst));
2220 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2221 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2222 break;
2223 case OPCODE_CAL:
2224 brw_set_access_mode(p, BRW_ALIGN_1);
2225 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2226 brw_set_access_mode(p, BRW_ALIGN_16);
2227 brw_ADD(p, get_addr_reg(stack_index),
2228 get_addr_reg(stack_index), brw_imm_d(4));
2229 brw_save_call(p, inst->Comment, p->nr_insn);
2230 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2231 break;
2232 case OPCODE_RET:
2233 brw_ADD(p, get_addr_reg(stack_index),
2234 get_addr_reg(stack_index), brw_imm_d(-4));
2235 brw_set_access_mode(p, BRW_ALIGN_1);
2236 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
2237 brw_set_access_mode(p, BRW_ALIGN_16);
2238 break;
2239 case OPCODE_END:
2240 emit_vertex_write(c);
2241 break;
2242 case OPCODE_PRINT:
2243 /* no-op */
2244 break;
2245 case OPCODE_BGNSUB:
2246 brw_save_label(p, inst->Comment, p->nr_insn);
2247 break;
2248 case OPCODE_ENDSUB:
2249 /* no-op */
2250 break;
2251 default:
2252 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
2253 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2254 _mesa_opcode_string(inst->Opcode) :
2255 "unknown");
2256 }
2257
2258 /* Set the predication update on the last instruction of the native
2259 * instruction sequence.
2260 *
2261 * This would be problematic if it was set on a math instruction,
2262 * but that shouldn't be the case with the current GLSL compiler.
2263 */
2264 if (inst->CondUpdate) {
2265 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
2266
2267 assert(hw_insn->header.destreg__conditionalmod == 0);
2268 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
2269 }
2270
2271 if ((inst->DstReg.File == PROGRAM_OUTPUT)
2272 && (inst->DstReg.Index != VERT_RESULT_HPOS)
2273 && c->output_regs[inst->DstReg.Index].used_in_src) {
2274 brw_MOV(p, get_dst(c, inst->DstReg), dst);
2275 }
2276
2277 /* Result color clamping.
2278 *
2279 * When destination register is an output register and
2280 * it's primary/secondary front/back color, we have to clamp
2281 * the result to [0,1]. This is done by enabling the
2282 * saturation bit for the last instruction.
2283 *
2284 * We don't use brw_set_saturate() as it modifies
2285 * p->current->header.saturate, which affects all the subsequent
2286 * instructions. Instead, we directly modify the header
2287 * of the last (already stored) instruction.
2288 */
2289 if (inst->DstReg.File == PROGRAM_OUTPUT &&
2290 c->key.clamp_vertex_color) {
2291 if ((inst->DstReg.Index == VERT_RESULT_COL0)
2292 || (inst->DstReg.Index == VERT_RESULT_COL1)
2293 || (inst->DstReg.Index == VERT_RESULT_BFC0)
2294 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
2295 p->store[p->nr_insn-1].header.saturate = 1;
2296 }
2297 }
2298
2299 if (inst->DstReg.RelAddr) {
2300 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
2301 inst->DstReg.File == PROGRAM_OUTPUT);
2302 move_to_reladdr_dst(c, inst, dst);
2303 }
2304
2305 release_tmps(c);
2306 }
2307
2308 brw_resolve_cals(p);
2309 brw_set_uip_jip(p);
2310
2311 brw_optimize(p);
2312
2313 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
2314 int i;
2315
2316 printf("vs-native:\n");
2317 for (i = 0; i < p->nr_insn; i++)
2318 brw_disasm(stdout, &p->store[i], intel->gen);
2319 printf("\n");
2320 }
2321 }