i965: Make the old VS backend record pull constant references in pull_params[].
[mesa.git] src/mesa/drivers/dri/i965/brw_vs_emit.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */


#include "main/macros.h"
#include "program/program.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "brw_context.h"
#include "brw_vs.h"
/* Return whether the given source argument of an instruction can be supplied
 * as an immediate float operand, instead of referencing a PROGRAM_CONSTANT
 * value through the push/pull constant buffers.
 */
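/* On this hardware an immediate must be the final source operand, so the
 * table below records, per opcode, which argument slot that is; e.g. ADD's
 * entry is 2, meaning only arg == 1 (the second operand) may be immediate.
 * MAD and LRP are special-cased because this backend emits them as
 * two-instruction sequences, each of which can absorb one immediate.
 */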
static GLboolean
brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
{
   int opcode_array[] = {
      [OPCODE_MOV] = 1,
      [OPCODE_ADD] = 2,
      [OPCODE_CMP] = 3,
      [OPCODE_DP2] = 2,
      [OPCODE_DP3] = 2,
      [OPCODE_DP4] = 2,
      [OPCODE_DPH] = 2,
      [OPCODE_MAX] = 2,
      [OPCODE_MIN] = 2,
      [OPCODE_MUL] = 2,
      [OPCODE_SEQ] = 2,
      [OPCODE_SGE] = 2,
      [OPCODE_SGT] = 2,
      [OPCODE_SLE] = 2,
      [OPCODE_SLT] = 2,
      [OPCODE_SNE] = 2,
      [OPCODE_XPD] = 2,
   };

   /* These opcodes get broken down in a way that allows two
    * args to be immediates.
    */
   if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
      if (arg == 1 || arg == 2)
         return GL_TRUE;
   }

   if (opcode >= ARRAY_SIZE(opcode_array))
      return GL_FALSE;

   return arg == opcode_array[opcode] - 1;
}

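/**
 * Allocate the next free temporary GRF with a simple bump allocator.
 * Temporaries are only reclaimed in stack order (see release_tmp()), so
 * callers free them in the reverse order of allocation.
 */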
static struct brw_reg get_tmp( struct brw_vs_compile *c )
{
   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);

   if (++c->last_tmp > c->prog_data.total_grf)
      c->prog_data.total_grf = c->last_tmp;

   return tmp;
}

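/**
 * Release a temporary.  Only the most recently allocated temporary can
 * actually be reclaimed here; anything older stays reserved until
 * release_tmps() resets the allocator back to first_tmp.
 */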
static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
{
   if (tmp.nr == c->last_tmp-1)
      c->last_tmp--;
}

static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}

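/**
 * Find the lowest-numbered OUTPUT register written with relative
 * addressing.  Outputs at or beyond this index must be allocated in GRFs
 * rather than MRFs, since indirect writes cannot target message registers.
 */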
static int
get_first_reladdr_output(struct gl_vertex_program *vp)
{
   int i;
   int first_reladdr_output = VERT_RESULT_MAX;

   for (i = 0; i < vp->Base.NumInstructions; i++) {
      struct prog_instruction *inst = vp->Base.Instructions + i;

      if (inst->DstReg.File == PROGRAM_OUTPUT &&
          inst->DstReg.RelAddr &&
          inst->DstReg.Index < first_reladdr_output)
         first_reladdr_output = inst->DstReg.Index;
   }

   return first_reladdr_output;
}

/* Clears the record of which vp_const_buffer elements have been
 * loaded into our constant buffer registers, for the starts of new
 * blocks after control flow.
 */
static void
clear_current_const(struct brw_vs_compile *c)
{
   unsigned int i;

   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].index = -1;
      }
   }
}

/**
 * Preallocate GRF registers before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, mrf, j;
   int attributes_in_vue;
   int first_reladdr_output;
   int max_constant;
   int constant = 0;
   int vert_result_reorder[VERT_RESULT_MAX];
   int bfc = 0;
   struct brw_vertex_program *vp = c->vp;
   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The latter is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
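   /* Two vec4 planes fit per GRF: on gen6, plane i occupies half (i % 2)
    * of register reg + i/2.  The pre-gen6 layout below additionally skips
    * six vec4 curbe slots (three GRFs) ahead of the first user plane.
    */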
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
         for (i = 0; i < c->key.nr_userclip; i++) {
            c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
                                                  (i % 2) * 4), 0, 4, 1);
         }
         reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
         for (i = 0; i < c->key.nr_userclip; i++) {
            c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
                                                  (i % 2) * 4), 0, 4, 1);
         }
         reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }

   }

   /* Assign some (probably all) of the vertex program constants to
    * the push constant buffer/CURBE.
    *
    * There's an obvious limit to the number of push constants equal to
    * the number of registers available, and that number is smaller
    * than the minimum maximum number of vertex program parameters, so
    * support for pull constants is required if we overflow.
    * Additionally, on gen6 the number of push constants is even
    * lower.
    *
    * When there's relative addressing, we don't know what range of
    * Mesa IR registers can be accessed.  And generally, when relative
    * addressing is used we also have too many constants to load them
    * all as push constants.  So, we'll just support relative
    * addressing out of the pull constant buffers, and try to load as
    * many statically-accessed constants into the push constant buffer
    * as we can.
    */
   if (intel->gen >= 6) {
      /* We can only load 32 regs of push constants. */
      max_constant = 32 * 2 - c->key.nr_userclip;
   } else {
      max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
   }

   /* constant_map maps from ParameterValues[] index to index in the
    * push constant buffer, or -1 if it's only in the pull constant
    * buffer.
    */
   memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
   for (i = 0;
        i < c->vp->program.Base.NumInstructions && constant < max_constant;
        i++) {
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
      int arg;

      for (arg = 0; arg < 3 && constant < max_constant; arg++) {
         if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
             inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
             inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
             inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
             inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
            continue;
         }

         if (inst->SrcReg[arg].RelAddr) {
            c->vp->use_const_buffer = GL_TRUE;
            continue;
         }

         if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
            c->constant_map[inst->SrcReg[arg].Index] = constant++;
         }
      }
   }

   /* If we ran out of push constant space, then we'll also upload all
    * constants through the pull constant buffer so that they can be
    * accessed no matter what.  For relative addressing (the common
    * case) we need them all in place anyway.
    */
   if (constant == max_constant)
      c->vp->use_const_buffer = GL_TRUE;

   /* Set up the references to the pull parameters if present.  This backend
    * uses a 1:1 mapping from Mesa IR's index to location in the pull constant
    * buffer, while the new VS backend allocates values to the pull buffer on
    * demand.
    */
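   /* With this layout, pull_param[4 * n + ch] aliases ParameterValues[n][ch],
    * so the constant at Mesa IR index n sits at byte offset 16 * n in the
    * pull buffer -- the same offset get_constant() computes when fetching.
    */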
   if (c->vp->use_const_buffer) {
      for (i = 0; i < params->NumParameters * 4; i++) {
         c->prog_data.pull_param[i] = &params->ParameterValues[i / 4][i % 4].f;
      }
      c->prog_data.nr_pull_params = i;
   }

   for (i = 0; i < constant; i++) {
      c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
                                                          (i % 2) * 4),
                                             0, 4, 1);
   }
   reg += (constant + 1) / 2;
   c->prog_data.curb_read_length = reg - 1;
   c->prog_data.nr_params = constant * 4;
   /* XXX 0 causes a bug elsewhere... */
   if (intel->gen < 6 && c->prog_data.nr_params == 0)
      c->prog_data.nr_params = 4;

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
         c->nr_inputs++;
         c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
         reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   c->nr_outputs = 0;
   c->first_output = reg;
   c->first_overflow_output = 0;

   if (intel->gen >= 6) {
      mrf = 3;
      if (c->key.nr_userclip)
         mrf += 2;
   } else if (intel->gen == 5)
      mrf = 8;
   else
      mrf = 4;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);

   for (i = 0; i < VERT_RESULT_MAX; i++)
      vert_result_reorder[i] = i;

   /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
   if (intel->gen >= 6 && c->key.two_side_color) {
      if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
          (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
         assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
         assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
         bfc = 2;
      } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
                 (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
         bfc = 1;

      if (bfc) {
         for (i = 0; i < bfc; i++) {
            vert_result_reorder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i;
            vert_result_reorder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i;
         }

         for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) {
            vert_result_reorder[i] = i - bfc;
         }
      }
   }

   for (j = 0; j < VERT_RESULT_MAX; j++) {
      i = vert_result_reorder[j];

      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
         c->nr_outputs++;
         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
         if (i == VERT_RESULT_HPOS) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else if (i == VERT_RESULT_PSIZ) {
            c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
            reg++;
         }
         else {
            /* Two restrictions on our compute-to-MRF here.  The
             * message length for all SEND messages is restricted to
             * [1,15], so we can't use mrf 15, as that means a length
             * of 16.
             *
             * Additionally, URB writes are aligned to URB rows, so we
             * need to put an even number of registers of URB data in
             * each URB write so that the later write is aligned.  A
             * message length of 15 means 1 message header reg plus 14
             * regs of URB data.
             *
             * For attributes beyond the compute-to-MRF, we compute to
             * GRFs and they will be written in the second URB_WRITE.
             */
            if (first_reladdr_output > i && mrf < 15) {
               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
               mrf++;
            }
            else {
               if (mrf >= 15 && !c->first_overflow_output)
                  c->first_overflow_output = i;
               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
               reg++;
               mrf++;
            }
         }
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
                                            reg,
                                            0,
                                            BRW_REGISTER_TYPE_D,
                                            BRW_VERTICAL_STRIDE_8,
                                            BRW_WIDTH_8,
                                            BRW_HORIZONTAL_STRIDE_1,
                                            BRW_SWIZZLE_XXXX,
                                            WRITEMASK_X);
      reg++;
   }

   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
         c->current_const[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
      clear_current_const(c);
   }

   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
         reg++;
      }
   }

   if (c->needs_stack) {
      c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg; /* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);

   /* See emit_vertex_write() for where the VUE's overhead on top of the
    * attributes comes from.
    */
   if (intel->gen >= 7) {
      int header_regs = 2;
      if (c->key.nr_userclip)
         header_regs += 2;

      /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
       * number of 64-byte (512-bit) units.
       */
      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 3) / 4;
   } else if (intel->gen == 6) {
      int header_regs = 2;
      if (c->key.nr_userclip)
         header_regs += 2;

      /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
       * number of 128-byte (1024-bit) units.
       */
      c->prog_data.urb_entry_size = (attributes_in_vue + header_regs + 7) / 8;
   } else if (intel->gen == 5)
      /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
       * number of 64-byte (512-bit) units.
       */
      c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
   else
      c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;

   c->prog_data.total_grf = reg;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}


/**
 * If an instruction uses a temp reg both as a src and the dest, we
 * sometimes need to allocate an intermediate temporary.
 */
static void unalias1( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      void (*func)( struct brw_vs_compile *,
                                    struct brw_reg,
                                    struct brw_reg ))
{
   if (dst.file == arg0.file && dst.nr == arg0.nr) {
      struct brw_compile *p = &c->func;
      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
      func(c, tmp, arg0);
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
   else {
      func(c, dst, arg0);
   }
}

/**
 * \sa unalias1
 * Checks whether a 2-operand instruction needs an intermediate temporary.
 */
static void unalias2( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      void (*func)( struct brw_vs_compile *,
                                    struct brw_reg,
                                    struct brw_reg,
                                    struct brw_reg ))
{
   if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
       (dst.file == arg1.file && dst.nr == arg1.nr)) {
      struct brw_compile *p = &c->func;
      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
      func(c, tmp, arg0, arg1);
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
   else {
      func(c, dst, arg0, arg1);
   }
}

/**
 * \sa unalias2
 * Checks whether a 3-operand instruction needs an intermediate temporary.
 */
static void unalias3( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      struct brw_reg arg2,
                      void (*func)( struct brw_vs_compile *,
                                    struct brw_reg,
                                    struct brw_reg,
                                    struct brw_reg,
                                    struct brw_reg ))
{
   if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
       (dst.file == arg1.file && dst.nr == arg1.nr) ||
       (dst.file == arg2.file && dst.nr == arg2.nr)) {
      struct brw_compile *p = &c->func;
      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
      func(c, tmp, arg0, arg1, arg2);
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
   else {
      func(c, dst, arg0, arg1, arg2);
   }
}

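/**
 * Emit a SET-ON-condition sequence: dst = (arg0 cond arg1) ? 1.0 : 0.0.
 * brw_CMP() to the null register leaves subsequent instructions predicated
 * on the comparison result, so the second MOV only lands in the channels
 * that passed; the final call then drops predication again.
 */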
static void emit_sop( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}

static void emit_seq( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}

static void emit_sne( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}

static void emit_slt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}

static void emit_sle( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}

static void emit_sgt( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}

static void emit_sge( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}

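/**
 * CMP: dst = (arg0 < 0.0) ? arg1 : arg2, done per channel with a
 * predicated SEL.
 */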
static void emit_cmp( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1,
                      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}

static void emit_sign(struct brw_vs_compile *c,
                      struct brw_reg dst,
                      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}

static void emit_max( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}

static void emit_min( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg arg0,
                      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}

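/**
 * ARL: load the address register with RNDD (round toward -inf) of the
 * float source.  On gen6 the rounding is done into the destination
 * reinterpreted as float, then converted to integer with a separate MOV.
 */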
static void emit_arl(struct brw_compile *p,
                     struct brw_reg dst,
                     struct brw_reg src)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);

      brw_RNDD(p, dst_f, src);
      brw_MOV(p, dst, dst_f);
   } else {
      brw_RNDD(p, dst, src);
   }
}

static void emit_math1_gen4(struct brw_vs_compile *c,
                            GLuint function,
                            struct brw_reg dst,
                            struct brw_reg arg0,
                            GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}

static void
emit_math1_gen6(struct brw_vs_compile *c,
                GLuint function,
                struct brw_reg dst,
                struct brw_reg arg0,
                GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src, tmp_dst;

   /* Something is strange on gen6 math in 16-wide mode, though the
    * docs say it's supposed to work.  Punt to using align1 mode,
    * which doesn't do writemasking and swizzles.
    */
   tmp_src = get_tmp(c);
   tmp_dst = get_tmp(c);

   brw_MOV(p, tmp_src, arg0);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
            tmp_dst,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            tmp_src,
            BRW_MATH_DATA_SCALAR,
            precision);
   brw_set_access_mode(p, BRW_ALIGN_16);

   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src);
   release_tmp(c, tmp_dst);
}

static void
emit_math1(struct brw_vs_compile *c,
           GLuint function,
           struct brw_reg dst,
           struct brw_reg arg0,
           GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6)
      emit_math1_gen6(c, function, dst, arg0, precision);
   else
      emit_math1_gen4(c, function, dst, arg0, precision);
}

static void emit_math2_gen4( struct brw_vs_compile *c,
                             GLuint function,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
            tmp,
            function,
            BRW_MATH_SATURATE_NONE,
            2,
            arg0,
            BRW_MATH_DATA_SCALAR,
            precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}

static void emit_math2_gen6( struct brw_vs_compile *c,
                             GLuint function,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src0, tmp_src1, tmp_dst;

   tmp_src0 = get_tmp(c);
   tmp_src1 = get_tmp(c);
   tmp_dst = get_tmp(c);

   brw_MOV(p, tmp_src0, arg0);
   brw_MOV(p, tmp_src1, arg1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
             tmp_dst,
             function,
             tmp_src0,
             tmp_src1);
   brw_set_access_mode(p, BRW_ALIGN_16);

   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src0);
   release_tmp(c, tmp_src1);
   release_tmp(c, tmp_dst);
}

static void emit_math2( struct brw_vs_compile *c,
                        GLuint function,
                        struct brw_reg dst,
                        struct brw_reg arg0,
                        struct brw_reg arg1,
                        GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6)
      emit_math2_gen6(c, function, dst, arg0, arg1, precision);
   else
      emit_math2_gen4(c, function, dst, arg0, arg1, precision);
}

static void emit_exp_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
              tmp_d, brw_imm_d(23));
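      /* e.g. arg0.x == 2.5: tmp_d == 2, biased to 129, and 129 << 23 is
       * the IEEE-754 bit pattern of 2^(129-127) == 4.0.
       */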

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a Taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_EXP,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(arg0, 0),
                 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}


static void emit_log_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
                         dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = ((x.i & ((1u<<31)-1)) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1)) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
              brw_writemask(tmp_ud, WRITEMASK_X),
              tmp_ud,
              brw_imm_ud(23));

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_X),
              retype(tmp_ud, BRW_REGISTER_TYPE_D),      /* does it matter? */
              brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      brw_AND(p,
              brw_writemask(tmp_ud, WRITEMASK_Y),
              brw_swizzle1(arg0_ud, 0),
              brw_imm_ud((1<<23)-1));

      brw_OR(p,
             brw_writemask(tmp_ud, WRITEMASK_Y),
             tmp_ud,
             brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint at how to do this with a
       * Taylor series.  Maybe we *should* use a Taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       * - result[0] + mathbox.LOG2(result[1])
       * - mathbox.LOG2(arg0.x)
       * - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
                 BRW_MATH_FUNCTION_LOG,
                 brw_writemask(tmp, WRITEMASK_Z),
                 brw_swizzle1(tmp, 1),
                 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
              brw_writemask(tmp, WRITEMASK_Z),
              brw_swizzle1(tmp, 2),
              brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}


/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx  r1
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0,
                              struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}


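/**
 * Cross product via the accumulator: the MUL (null destination, implicit
 * accumulator write) computes t.yzx * u.zxy, then MAC subtracts
 * t.zxy * u.yzx from it, giving dst = t x u.
 */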
static void emit_xpd( struct brw_compile *p,
                      struct brw_reg dst,
                      struct brw_reg t,
                      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}


static void emit_lit_noalias( struct brw_vs_compile *c,
                              struct brw_reg dst,
                              struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisons.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
                 BRW_MATH_FUNCTION_POW,
                 brw_writemask(dst, WRITEMASK_Z),
                 brw_swizzle1(tmp, 2),
                 brw_swizzle1(arg0, 3),
                 BRW_MATH_PRECISION_PARTIAL);
   }
   brw_ENDIF(p);

   release_tmp(c, tmp);
}

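/**
 * LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.  The ADD leaves (1 - arg0)
 * in dst, the MUL leaves (1 - arg0) * arg2 in the accumulator, and MAC
 * folds in arg0 * arg1.
 */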
static void emit_lrp_noalias(struct brw_vs_compile *c,
                             struct brw_reg dst,
                             struct brw_reg arg0,
                             struct brw_reg arg1,
                             struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}

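/**
 * Fetch a pull constant into the GRF reserved for this source slot.  The
 * previous load is reused when the same index is still resident there
 * (tracked in current_const[]).
 */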
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                    /* writeback dest */
                       16 * src->Index,              /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER  /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 1);
   const_reg.subnr = 0;

   return const_reg;
}

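/**
 * Like get_constant(), but the read is offset by the address register, so
 * the result can never be cached: current_const[] is invalidated instead.
 * The pre-gen6 message takes byte offsets, hence the scaling by 16 below.
 */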
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
                     const struct prog_instruction *inst,
                     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   uint32_t offset;

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

#if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
          src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   if (intel->gen >= 6) {
      offset = src->Index;
   } else {
      struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
      brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
      addr_reg = byte_addr_reg;
      offset = 16 * src->Index;
   }

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
                             const_reg,
                             addr_reg,
                             offset,
                             SURF_INDEX_VERT_CONST_BUFFER);

   return const_reg;
}



/* TODO: relative addressing!
 */
static struct brw_reg get_reg( struct brw_vs_compile *c,
                               gl_register_file file,
                               GLuint index )
{
   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      assert(c->regs[file][index].nr != 0);
      return c->regs[file][index];
   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
      return c->regs[PROGRAM_STATE_VAR][index];
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:   /* undef values */
      return brw_null_reg();

   case PROGRAM_LOCAL_PARAM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}


/**
 * Indirect addressing: get reg[[arg] + offset].
 */
static struct brw_reg deref( struct brw_vs_compile *c,
                             struct brw_reg arg,
                             GLint offset,
                             GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}

static void
move_to_reladdr_dst(struct brw_vs_compile *c,
                    const struct prog_instruction *inst,
                    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
           brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}

/**
 * Get brw reg corresponding to the instruction's [argIndex] src reg.
 * TODO: relative addressing!
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
                                        SWIZZLE_ZERO,
                                        SWIZZLE_ZERO,
                                        SWIZZLE_ZERO)) {
         return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
                                               SWIZZLE_ONE,
                                               SWIZZLE_ONE,
                                               SWIZZLE_ONE)) {
         if (src->Negate)
            return brw_imm_f(-1.0F);
         else
            return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
         const struct gl_program_parameter_list *params;
         float f;
         int component = -1;

         switch (src->Swizzle) {
         case SWIZZLE_XXXX:
            component = 0;
            break;
         case SWIZZLE_YYYY:
            component = 1;
            break;
         case SWIZZLE_ZZZZ:
            component = 2;
            break;
         case SWIZZLE_WWWW:
            component = 3;
            break;
         }

         if (component >= 0) {
            params = c->vp->program.Base.Parameters;
            f = params->ParameterValues[src->Index][component].f;

            if (src->Abs)
               f = fabs(f);
            if (src->Negate)
               f = -f;
            return brw_imm_f(f);
         }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index, 32);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (!relAddr && c->constant_map[index] != -1) {
         /* Take from the push constant buffer if possible. */
         assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
      } else {
         /* Must be in the pull constant buffer then. */
         assert(c->vp->use_const_buffer);
         if (relAddr)
            return get_reladdr_constant(c, inst, argIndex);
         else
            return get_constant(c, inst, argIndex);
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}

/**
 * Return the brw reg for the given instruction's src argument.
 * Will return mangled results for SWZ op.  The emit_swz() function
 * ignores this result and recalculates taking extended swizzles into
 * account.
 */
static struct brw_reg get_arg( struct brw_vs_compile *c,
                               const struct prog_instruction *inst,
                               GLuint argIndex )
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_reg reg;

   if (src->File == PROGRAM_UNDEFINED)
      return brw_null_reg();

   reg = get_src_reg(c, inst, argIndex);

   /* Convert 3-bit swizzle to 2-bit.
    */
   if (reg.file != BRW_IMMEDIATE_VALUE) {
      reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
                                          GET_SWZ(src->Swizzle, 1),
                                          GET_SWZ(src->Swizzle, 2),
                                          GET_SWZ(src->Swizzle, 3));

      /* Note this is ok for non-swizzle ARB_vp instructions */
      reg.negate = src->Negate ? 1 : 0;
   }

   return reg;
}


/**
 * Get brw register for the given program dest register.
 */
static struct brw_reg get_dst( struct brw_vs_compile *c,
                               struct prog_dst_register dst )
{
   struct brw_reg reg;

   switch (dst.File) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_OUTPUT:
      /* register-indirect addressing is only 1x1, not VxH, for
       * destination regs.  So, for RelAddr we'll return a temporary
       * for the dest and do a move of the result to the RelAddr
       * register after the instruction emit.
       */
      if (dst.RelAddr) {
         reg = get_tmp(c);
      } else {
         assert(c->regs[dst.File][dst.Index].nr != 0);
         reg = c->regs[dst.File][dst.Index];
      }
      break;
   case PROGRAM_ADDRESS:
      assert(dst.Index == 0);
      reg = c->regs[dst.File][dst.Index];
      break;
   case PROGRAM_UNDEFINED:
      /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
      reg = brw_null_reg();
      break;
   default:
      assert(0);
      reg = brw_null_reg();
   }

   assert(reg.type != BRW_IMMEDIATE_VALUE);
   reg.dw1.bits.writemask = dst.WriteMask;

   return reg;
}


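/**
 * SWZ can select 0.0 and 1.0 (and per-component negation) in addition to
 * the four source channels, so it is emitted as up to three writemasked
 * MOVs -- source channels, zeros, ones -- plus a final negate pass.
 */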
static void emit_swz( struct brw_vs_compile *c,
                      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
         GLubyte s = GET_SWZ(src.Swizzle, i);
         switch (s) {
         case SWIZZLE_X:
         case SWIZZLE_Y:
         case SWIZZLE_Z:
         case SWIZZLE_W:
            src_mask |= 1<<i;
            src_swz[i] = s;
            break;
         case SWIZZLE_ZERO:
            zeros_mask |= 1<<i;
            break;
         case SWIZZLE_ONE:
            ones_mask |= 1<<i;
            break;
         }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
                         src_swz[0], src_swz[1],
                         src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}

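/**
 * Pad an interleaved URB write's message length so the URB data (mlen
 * minus one header reg) stays a multiple of two registers on gen6+; a
 * correctly padded mlen is therefore always odd.
 */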
static int
align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
{
   struct intel_context *intel = &brw->intel;

   if (intel->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}

/**
 * Post-vertex-program processing.  Send the results to the URB.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;
   int next_mrf, i;
   int msg_len;

   if (c->key.copy_edgeflag) {
      brw_MOV(p,
              get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
              get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (intel->gen >= 6) {
      struct brw_reg m1 = brw_message_reg(1);

      /* On gen6, m1 has each value in a separate dword, so we never
       * need to mess with a temporary for computing the m1 value.
       */
      brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
                 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
      }

      /* Set the user clip distances in dword 8-15. (m3-4) */
      if (c->key.nr_userclip) {
         for (i = 0; i < c->key.nr_userclip; i++) {
            struct brw_reg m;
            if (i < 4)
               m = brw_message_reg(3);
            else
               m = brw_message_reg(4);

            brw_DP4(p, brw_writemask(m, (1 << (i & 3))), pos, c->userplane[i]);
         }
      }
   } else if ((c->prog_data.outputs_written &
               BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
              c->key.nr_userclip || brw->has_negative_rhw_bug) {
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
         struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
         brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
                 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
         brw_AND(p, brw_writemask(header1, WRITEMASK_W),
                 header1, brw_imm_ud(0x7ff<<8));
      }

      for (i = 0; i < c->key.nr_userclip; i++) {
         brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
         brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         brw_CMP(p,
                 vec8(brw_null_reg()),
                 BRW_CONDITIONAL_L,
                 brw_swizzle1(ndc, 3),
                 brw_imm_f(0));

         brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
         brw_MOV(p, ndc, brw_imm_f(0));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);      /* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_acc_write_control(p, 0);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.
       * m3 (or m5 when user clipping is enabled) is the first vertex
       * element data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      len_vertex_header = 1;
      if (c->key.nr_userclip > 0)
         len_vertex_header += 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Move variable-addressed, non-overflow outputs to their MRFs. */
   next_mrf = 2 + len_vertex_header;
   for (i = 0; i < VERT_RESULT_MAX; i++) {
      if (c->first_overflow_output > 0 && i >= c->first_overflow_output)
         break;
      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
         continue;
      if (i == VERT_RESULT_PSIZ)
         continue;

      if (i >= VERT_RESULT_TEX0 &&
          c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
         brw_MOV(p, brw_message_reg(next_mrf), c->regs[PROGRAM_OUTPUT][i]);
         next_mrf++;
      } else if (c->regs[PROGRAM_OUTPUT][i].file == BRW_MESSAGE_REGISTER_FILE) {
         next_mrf = c->regs[PROGRAM_OUTPUT][i].nr + 1;
      }
   }

   eot = (c->first_overflow_output == 0);

   /* Message header, plus VUE header, plus the (first set of) outputs. */
   msg_len = 1 + len_vertex_header + c->nr_outputs;
   msg_len = align_interleaved_urb_mlen(brw, msg_len);
   /* Any outputs beyond BRW_MAX_MRF should be past first_overflow_output */
   msg_len = MIN2(msg_len, (BRW_MAX_MRF - 1));

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 0,              /* starting mrf reg nr */
                 c->r0,          /* src */
                 0,              /* allocate */
                 1,              /* used */
                 msg_len,
                 0,              /* response len */
                 eot,            /* eot */
                 eot,            /* writes complete */
                 0,              /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   if (c->first_overflow_output > 0) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint i, mrf = 1;
      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
            /* move from GRF to MRF */
            brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]);
            mrf++;
         }
      }

      brw_urb_WRITE(p,
                    brw_null_reg(), /* dest */
                    0,              /* starting mrf reg nr */
                    c->r0,          /* src */
                    0,              /* allocate */
                    1,              /* used */
                    align_interleaved_urb_mlen(brw, mrf),
                    0,              /* response len */
                    1,              /* eot */
                    1,              /* writes complete */
                    14 / 2,         /* urb destination offset */
                    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}

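/**
 * Return whether the previous instruction was a full-writemask MOV, MAC,
 * or MUL whose destination matches val; with AccWrEn set (as this backend
 * does), the accumulator then still holds val and a following MAC can use
 * it directly.
 */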
static GLboolean
accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];

   if (p->nr_insn == 0)
      return GL_FALSE;

   if (val.address_mode != BRW_ADDRESS_DIRECT)
      return GL_FALSE;

   if (val.negate || val.abs)
      return GL_FALSE;

   switch (prev_insn->header.opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MUL:
      if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
          prev_insn->header.execution_size == val.width &&
          prev_insn->bits1.da1.dest_reg_file == val.file &&
          prev_insn->bits1.da1.dest_reg_type == val.type &&
          prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
          prev_insn->bits1.da1.dest_reg_nr == val.nr &&
          prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
          prev_insn->bits1.da16.dest_writemask == 0xf)
         return GL_TRUE;
      else
         return GL_FALSE;
   default:
      return GL_FALSE;
   }
}

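/**
 * Translate a Mesa IR condition (GLSL only emits COND_NE with a single
 * replicated channel) into the brw predicate-control encoding.
 */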
static uint32_t
get_predicate(const struct prog_instruction *inst)
{
   if (inst->DstReg.CondMask == COND_TR)
      return BRW_PREDICATE_NONE;

   /* All of GLSL only produces predicates for COND_NE and one channel per
    * vector.  Fail badly if someone starts doing something else, as it might
    * mean infinite looping or something.
    *
    * We'd like to support all the condition codes, but our hardware doesn't
    * quite match the Mesa IR, which is modeled after the NV extensions.  For
    * those, the instruction may update the condition codes or not, then any
    * later instruction may use one of those condition codes.  For gen4, the
    * instruction may update the flags register based on one of the condition
    * codes output by the instruction, and then further instructions may
    * predicate on that.  We can probably support this, but it won't
    * necessarily be easy.
    */
   assert(inst->DstReg.CondMask == COND_NE);

   switch (inst->DstReg.CondSwizzle) {
   case SWIZZLE_XXXX:
      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
   case SWIZZLE_YYYY:
      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
   case SWIZZLE_ZZZZ:
      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
   case SWIZZLE_WWWW:
      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
   default:
      _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
                    inst->DstReg.CondMask);
      return BRW_PREDICATE_NORMAL;
   }
}

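/**
 * GL_FIXED vertex attributes arrive from the VF as raw 16.16 integers;
 * rescale the components the key says were supplied by 1/65536 to recover
 * their real values.
 */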
1873 static void
1874 brw_vs_rescale_gl_fixed(struct brw_vs_compile *c)
1875 {
1876 struct brw_compile *p = &c->func;
1877 int i;
1878
1879 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
1880 if (!(c->prog_data.inputs_read & (1 << i)))
1881 continue;
1882
1883 if (c->key.gl_fixed_input_size[i] != 0) {
1884 struct brw_reg reg = c->regs[PROGRAM_INPUT][i];
1885
1886 brw_MUL(p,
1887 brw_writemask(reg, (1 << c->key.gl_fixed_input_size[i]) - 1),
1888 reg, brw_imm_f(1.0 / 65536.0));
1889 }
1890 }
1891 }
1892
/* Emit the vertex program instructions here.
 */
void brw_old_vs_emit(struct brw_vs_compile *c)
{
#define MAX_IF_DEPTH 32
#define MAX_LOOP_DEPTH 32
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
   GLuint insn, loop_depth = 0;
   struct brw_instruction *loop_inst[MAX_LOOP_DEPTH] = { 0 };
   int if_depth_in_loop[MAX_LOOP_DEPTH];
   const struct brw_indirect stack_index = brw_indirect(0, 0);
   GLuint index;
   GLuint file;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("vs-mesa:\n");
      _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
                               GL_TRUE);
      printf("\n");
   }

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_access_mode(p, BRW_ALIGN_16);
   if_depth_in_loop[loop_depth] = 0;

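   /* Ask the EU to write every instruction's result to the accumulator as
    * well (AccWrEn); accumulator_contains() above relies on this when
    * deciding whether a MAD can reuse the accumulator contents.
    */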
   brw_set_acc_write_control(p, 1);

   for (insn = 0; insn < nr_insns; insn++) {
      GLuint i;
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];

      /* Message registers can't be read, so copy an output into a GRF
       * register if it is later used as a source.
       */
      for (i = 0; i < 3; i++) {
         struct prog_src_register *src = &inst->SrcReg[i];
         GLuint index = src->Index;
         GLuint file = src->File;
         if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
            c->output_regs[index].used_in_src = GL_TRUE;
      }

      switch (inst->Opcode) {
      case OPCODE_CAL:
      case OPCODE_RET:
         c->needs_stack = GL_TRUE;
         break;
      default:
         break;
      }
   }


   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   brw_vs_rescale_gl_fixed(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs.  SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
         for (i = 0; i < 3; i++) {
            const struct prog_src_register *src = &inst->SrcReg[i];
            index = src->Index;
            file = src->File;
            if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) {
               /* Can't just make get_arg "do the right thing" here because
                * other callers of get_arg and get_src_reg don't expect any
                * special behavior for the c->output_regs[index].used_in_src
                * case.
                */
               args[i] = c->output_regs[index].reg;
               args[i].dw1.bits.swizzle =
                  BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
                               GET_SWZ(src->Swizzle, 1),
                               GET_SWZ(src->Swizzle, 2),
                               GET_SWZ(src->Swizzle, 3));

               /* Note this is ok for non-swizzle ARB_vp instructions */
               args[i].negate = src->Negate ? 1 : 0;
            } else
               args[i] = get_arg(c, inst, i);
         }

      /* Get dest regs.  Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers.  So
       * care needs to be taken when emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
         /* Can't just make get_dst "do the right thing" here because other
          * callers of get_dst don't expect any special behavior for the
          * c->output_regs[index].used_in_src case.
          */
         dst = brw_writemask(c->output_regs[index].reg, inst->DstReg.WriteMask);
      else
         dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
         _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
         args[0].negate = false;
         brw_MOV(p, dst, brw_abs(args[0]));
         break;
      case OPCODE_ADD:
         brw_ADD(p, dst, args[0], args[1]);
         break;
      case OPCODE_COS:
         emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_DP2:
         brw_DP2(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP3:
         brw_DP3(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP4:
         brw_DP4(p, dst, args[0], args[1]);
         break;
      case OPCODE_DPH:
         brw_DPH(p, dst, args[0], args[1]);
         break;
      case OPCODE_DST:
         unalias2(c, dst, args[0], args[1], emit_dst_noalias);
         break;
      case OPCODE_EXP:
         unalias1(c, dst, args[0], emit_exp_noalias);
         break;
      case OPCODE_EX2:
         emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_ARL:
         emit_arl(p, dst, args[0]);
         break;
      case OPCODE_FLR:
         brw_RNDD(p, dst, args[0]);
         break;
      case OPCODE_FRC:
         brw_FRC(p, dst, args[0]);
         break;
      case OPCODE_LOG:
         unalias1(c, dst, args[0], emit_log_noalias);
         break;
      case OPCODE_LG2:
         emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_LIT:
         unalias1(c, dst, args[0], emit_lit_noalias);
         break;
      case OPCODE_LRP:
         unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
         break;
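      /* MAD is lowered through the accumulator, i.e.:
       *
       *    MOV acc, args[2]
       *    MAC dst, args[0], args[1]
       *
       * The MOV can be skipped when the previous instruction already left
       * args[2] in the accumulator (see accumulator_contains()).
       */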
      case OPCODE_MAD:
         if (!accumulator_contains(c, args[2]))
            brw_MOV(p, brw_acc_reg(), args[2]);
         brw_MAC(p, dst, args[0], args[1]);
         break;
      case OPCODE_CMP:
         emit_cmp(p, dst, args[0], args[1], args[2]);
         break;
      case OPCODE_MAX:
         emit_max(p, dst, args[0], args[1]);
         break;
      case OPCODE_MIN:
         emit_min(p, dst, args[0], args[1]);
         break;
      case OPCODE_MOV:
         brw_MOV(p, dst, args[0]);
         break;
      case OPCODE_MUL:
         brw_MUL(p, dst, args[0], args[1]);
         break;
      case OPCODE_POW:
         emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RCP:
         emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RSQ:
         emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
         break;

      case OPCODE_SEQ:
         unalias2(c, dst, args[0], args[1], emit_seq);
         break;
      case OPCODE_SIN:
         emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_SNE:
         unalias2(c, dst, args[0], args[1], emit_sne);
         break;
      case OPCODE_SGE:
         unalias2(c, dst, args[0], args[1], emit_sge);
         break;
      case OPCODE_SGT:
         unalias2(c, dst, args[0], args[1], emit_sgt);
         break;
      case OPCODE_SLT:
         unalias2(c, dst, args[0], args[1], emit_slt);
         break;
      case OPCODE_SLE:
         unalias2(c, dst, args[0], args[1], emit_sle);
         break;
      case OPCODE_SSG:
         unalias1(c, dst, args[0], emit_sign);
         break;
      case OPCODE_SUB:
         brw_ADD(p, dst, args[0], negate(args[1]));
         break;
      case OPCODE_SWZ:
         /* The args[0] value can't be used here as it won't have
          * correctly encoded the full swizzle:
          */
         emit_swz(c, dst, inst);
         break;
      case OPCODE_TRUNC:
         /* round toward zero */
         brw_RNDZ(p, dst, args[0]);
         break;
      case OPCODE_XPD:
         emit_xpd(p, dst, args[0], args[1]);
         break;
      case OPCODE_IF: {
         struct brw_instruction *if_inst = brw_IF(p, BRW_EXECUTE_8);
         /* Note that brw_IF smashes the predicate_control field. */
         if_inst->header.predicate_control = get_predicate(inst);
         if_depth_in_loop[loop_depth]++;
         break;
      }
      case OPCODE_ELSE:
         clear_current_const(c);
         brw_ELSE(p);
         break;
      case OPCODE_ENDIF:
         clear_current_const(c);
         brw_ENDIF(p);
         if_depth_in_loop[loop_depth]--;
         break;
      case OPCODE_BGNLOOP:
         clear_current_const(c);
         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
         if_depth_in_loop[loop_depth] = 0;
         break;
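      /* BRK/CONT execute under the predicate computed from the GLSL
       * condition; the current if-depth is passed along, presumably so the
       * hardware can unwind any IF/ENDIF nesting still open inside the loop
       * (my reading of brw_BREAK/brw_CONT, not verified against the PRM).
       */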
      case OPCODE_BRK:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_BREAK(p, if_depth_in_loop[loop_depth]);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
         brw_set_predicate_control(p, get_predicate(inst));
         if (intel->gen >= 6) {
            gen6_CONT(p, loop_inst[loop_depth - 1]);
         } else {
            brw_CONT(p, if_depth_in_loop[loop_depth]);
         }
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case OPCODE_ENDLOOP: {
         struct brw_instruction *inst0, *inst1;
         GLuint br = 1;

         clear_current_const(c);
         loop_depth--;

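         /* Jump deltas are scaled differently on Ironlake: gen5 counts
          * jumps in 64-bit halves of an instruction, so each instruction of
          * distance is worth 2, while gen4 counts whole 128-bit
          * instructions.
          */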
         if (intel->gen == 5)
            br = 2;

         inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);

         if (intel->gen < 6) {
            /* patch all the BREAK/CONT instructions from the last BGNLOOP */
            while (inst0 > loop_inst[loop_depth]) {
               inst0--;
               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
                   inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
               } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
                          inst0->bits3.if_else.jump_count == 0) {
                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
               }
            }
         }
      }
      break;

      case OPCODE_BRA:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
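      /* Subroutine calls keep return addresses on a small scratch stack
       * addressed through stack_index.  The IP register is in bytes and
       * each native instruction is 16 bytes, so "ip + 3*16" is the
       * instruction just past the three-instruction call sequence emitted
       * here, i.e. where RET should resume.
       */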
      case OPCODE_CAL:
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
         brw_set_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(4));
         brw_save_call(p, inst->Comment, p->nr_insn);
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         break;
      case OPCODE_RET:
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(-4));
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
         brw_set_access_mode(p, BRW_ALIGN_16);
         break;
      case OPCODE_END:
         emit_vertex_write(c);
         break;
      case OPCODE_PRINT:
         /* no-op */
         break;
      case OPCODE_BGNSUB:
         brw_save_label(p, inst->Comment, p->nr_insn);
         break;
      case OPCODE_ENDSUB:
         /* no-op */
         break;
      default:
         _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
                       _mesa_opcode_string(inst->Opcode) :
                       "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
         struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

         assert(hw_insn->header.destreg__conditionalmod == 0);
         hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When the destination register is an output register and
       * it's a primary/secondary front/back color, we have to clamp
       * the result to [0,1].  This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions.  Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT &&
          c->key.clamp_vertex_color) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

      if (inst->DstReg.RelAddr) {
         assert(inst->DstReg.File == PROGRAM_TEMPORARY ||
                inst->DstReg.File == PROGRAM_OUTPUT);
         move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

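   /* Post-passes: resolve the recorded CAL sites against their subroutine
    * labels, fill in the UIP/JIP jump offsets used by newer generations'
    * flow-control encoding, and run the peephole optimizer over the
    * finished program.
    */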
   brw_resolve_cals(p);
   brw_set_uip_jip(p);

   brw_optimize(p);

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
         brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}